integrate_adaptive and integrate_times give different answers for negative step size - odeint

I'm using the odeint library in Boost. When using the integrate_adaptive function, the results are as expected. However, when using integrate_times, the ODE is evaluated at very different times that are outside the range of integration. This is a problem for me because my ODE is not defined for some of the values that it is being evaluated at.
The code below demonstrates the issue. The x values for which the ODE is evaluated are printed to the screen.
#include <iostream>
#include <complex>
#include <vector>
#include <boost/numeric/odeint.hpp>
struct observe
{
std::vector<std::vector<std::complex<double> > > & y;
std::vector<double>& x_ode;
observe(std::vector<std::vector<std::complex<double> > > &p_y, std::vector<double> &p_x_ode) : y(p_y), x_ode(p_x_ode) { };
void operator()(const std::vector<std::complex<double> > &y_temp, double x_temp)
{
y.push_back(y_temp);
x_ode.push_back(x_temp);
}
};
class Direct
{
std::complex<double> alpha;
std::complex<double> beta;
std::complex<double> R;
std::vector<std::vector<std::complex<double> > > H0_create(const double y);
public:
Direct(std::complex<double> p_alpha, std::complex<double> p_beta, double p_R) : alpha(p_alpha), beta(p_beta), R(p_R) { }
void operator() (const std::vector<std::complex<double> > &y, std::vector<std::complex<double> > &dydx, const double x)
{
std::vector<std::vector<std::complex<double> > > H0 = H0_create(x);
for(int ii = 0; ii < 6; ii++)
{
dydx[ii] = 0.0;
for(int jj = 0; jj < 6; jj++)
{
dydx[ii] += H0[ii][jj]*y[jj];
}
}
}
};
std::vector<std::vector<std::complex<double> > > Direct::H0_create(const double x)
{
std::complex<double> i = std::complex<double>(0.0,1.0);
std::cout << x << std::endl;
double U = sin(x*3.14159/2.0);
double Ux = cos(x*3.14159/2.0);
std::complex<double> S = alpha*alpha + beta*beta + i*R*alpha*U;
std::vector<std::vector<std::complex<double> > > H0(6);
for(int ii = 0; ii < 6; ii++)
{
H0[ii] = std::vector<std::complex<double> >(6);
}
H0[0][1] = 1.0;
H0[1][0] = S;
H0[1][2] = R*Ux;
H0[1][3] = i*alpha*R;
H0[2][0] = -i*alpha;
H0[2][4] = -i*beta;
H0[3][1] = -i*alpha/R;
H0[3][2] = -S/R;
H0[3][5] = -i*beta/R;
H0[4][5] = 1.0;
H0[5][3] = i*beta*R;
H0[5][4] = S;
return H0;
}
int main()
{
int N = 10;
double x0 = 1.0;
double xf = 0.0;
std::vector<double> x_ode(N);
double delta_x0 = (xf-x0)/(N-1.0);
for(int ii = 0; ii < N; ii++)
{
x_ode[ii] = x0 + ii*delta_x0;
}
x_ode[N-1] = xf;
std::vector<std::vector<std::complex<double> > > y_temp;
std::vector<double> x_temp;
std::complex<double> i = std::complex<double>(0.0,1.0);
std::complex<double> alpha = 0.001*i;
double beta = 0.45;
double R = 500.0;
std::complex<double> lambda = -sqrt(alpha*alpha + beta*beta + i*R*alpha);
// Define Initial Conditions
std::vector<std::complex<double> > ICs = {1, lambda, -i*alpha/lambda,0,0,0};
// Initialize ODE class
Direct direct(alpha,beta,R);
{
using namespace boost::numeric::odeint;
double abs_tol = 1.0e-10;
double rel_tol = 1.0e-6;
std::cout << "integrate_adaptive x values:\n";
size_t steps1 = integrate_adaptive(make_controlled<runge_kutta_cash_karp54<std::vector<std::complex<double> > > >(abs_tol, rel_tol), direct, ICs, x0, xf, delta_x0, observe(y_temp,x_temp));
std::cout << "\n\nintegrate_times x values:\n";
size_t steps2 = integrate_times(make_controlled<runge_kutta_cash_karp54<std::vector<std::complex<double> > > >(abs_tol, rel_tol), direct, ICs, x_ode.begin(), x_ode.end(), delta_x0, observe(y_temp,x_temp));
}
return 0;
}
I am compiling and running by using these commands:
g++ main.cpp -std=c++11; ./a.out
The code produces this output:
integrate_adaptive x values:
1
0.977778
0.966667
0.933333
0.888889
0.902778
0.888889
0.849758
0.830193
0.771496
0.693235
0.717692
0.693235
0.654104
0.634539
0.575842
0.497581
0.522037
0.497581
0.45845
0.438885
0.380188
0.301927
0.326383
0.301927
0.262796
0.24323
0.184534
0.106273
0.130729
0.106273
0.0850181
0.0743908
0.042509
0
0.0132841
integrate_times x values:
1
0.977778
0.966667
0.933333
0.888889
0.902778
0.888889
0.84944
0.829716
0.770543
0.691645
0.716301
0.777778
0.738329
0.718605
0.659432
0.580534
0.60519
0.666667
0.627218
0.607494
0.54832
0.469423
0.494078
0.555556
0.512422
0.490855
0.426154
0.339886
0.366845
0.444444
0.397898
0.374625
0.304806
0.211714
0.240805
0.333333
0.281908
0.256196
0.179058
0.0762077
0.108348
0.222222
0.170797
0.145085
0.0679468
-0.0349035
-0.00276275
0.111111
0.059686
0.0339734
-0.0431643
-0.146015
-0.113874
0.111111
0.0671073
0.0451054
-0.0209003
-0.108908
-0.0814056
The range of integration is from x = 1 to 0 but the ODE is being evaluated at x values less than 0 when using integrate_times.

This is a bug in odeint caused by the negative time steps in your problem. I have created an issue on GitHub:
https://github.com/headmyshoulder/odeint-v2/issues/99
and I have implemented a fix. Please check out the latest odeint version from GitHub and see if the problem remains. If so, feel free to open a new issue on GitHub.
Thanks for pointing out that problem, and sorry for the bug.
Another note: I would suggest using a dense-output stepper for the integrate_times routine, as it is much more efficient (a factor of 2 in the best case). It basically does what was implemented as the fix above: it takes adaptive steps internally and interpolates at the intermediate points as required.
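A minimal sketch of that suggestion, assuming the same state type, system functor direct, observer, tolerances, and time grid as in the question (runge_kutta_dopri5 is used here because, unlike cash_karp54, it supports dense output):

typedef std::vector<std::complex<double> > state_type;
using namespace boost::numeric::odeint;
// The dense-output stepper adapts its internal step size and then
// interpolates the solution at the requested output times in x_ode.
size_t steps = integrate_times(
    make_dense_output<runge_kutta_dopri5<state_type> >(abs_tol, rel_tol),
    direct, ICs, x_ode.begin(), x_ode.end(), delta_x0,
    observe(y_temp, x_temp));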

Related

Data Clauses (output is zero when I use OpenACC)

I want to reduce the runtime of my code by using OpenACC, but unfortunately when I use OpenACC the output becomes zero.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#define NX 201
#define NY 101
#define NZ 201
int main(void)
{
int i, j, k, l, m;
static double tr, w;
static double dt = 9.5e-9, t;
static double cu[NZ];
static double AA[NX][NY][NZ] , CC[NX][NY][NZ] , BB[NX][NY][NZ] ;
static double A[NX][NY][NZ] , B[NX][NY][NZ] , C[NX][NY][NZ] ;
FILE *file;
file = fopen("BB-and-A.csv", "w");
t = 0.;
#pragma acc data copyin( tr, w,dt, t),copy(B ,A , C,AA , CC,BB,cu )
{
for (l = 1; l < 65; l++) {
#pragma acc kernels loop private(i, j,k)
for (i = 1; i < NX - 1; i++) {
for (j = 0; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
A[i][j][k] = A[i][j][k]
+ 1. * (B[i][j][k] - AA[i][j][k - 1]);
}
}
}
#pragma acc kernels loop private(i, j,k)
for (i = 1; i < NX - 1; i++) { /* BB */
for (j = 1; j < NY - 1; j++) {
for (k = 0; k < NZ - 1; k++) {
B[i][j][k] = B[i][j][k]
+ 1.* (BB[i][j][k] - A[i - 1][j][k]);
}
}
}
#pragma acc kernels
for (m = 1; m < NZ - 1; m++) {
tr = t - (double)(m)*5 / 1.5e8;
if (tr <= 0.)
cu[m] = 0.;
else {
w = (tr / 0.25e-6)*(tr / 0.25e-6);
cu[m] =1666*w / (w + 1.)*exp(-tr / 2.5e-6) ;
cu[m] = 2*cu[m];
}
A[10][60][m] = -cu[m];
}
#pragma acc update self(B)
fprintf(file, "%e, %e \n", t*1e6, -B[22][60][10] );
t = t + dt;
}
}
fclose(file);
}
The problem here is the "copyin( tr, w,dt, t)" clause, and in particular the "t" variable. By putting these scalars in a data clause, you need to manage the synchronization between the host and device copies yourself. Hence, when you update the variable on the host (i.e. "t = t + dt;"), you then need to update the device copy with the new value.
Also, there's a potential race condition on "tr", since the device code will now use the shared device variable instead of a private copy.
The easiest thing to do, though, is to simply not put these scalars in a data clause. By default, OpenACC privatizes scalars, so there's no need to manage them yourself. In t's case, its value will be passed as an argument to the CUDA kernel.
To fix your code change:
#pragma acc data copyin( tr, w,dt, t),copy(B ,A , C,AA , CC,BB,cu )
to:
#pragma acc data copy(B ,A , C,AA , CC,BB,cu )
Note that there's no need to put the loop indices in a private clause since they are implicitly private.
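If you did want to keep "t" in a data clause, a sketch of the alternative (using the loop from your code) would be to push the new host value to the device explicitly after each update:

t = t + dt;
#pragma acc update device(t) // synchronize the device copy with the new host value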

Implementation of LASSO in C

I am trying to understand the LASSO algorithm for linear regression. I have implemented the algorithm using the naive coordinate descent method for optimization. However, the coefficients I obtain from my code don't match those from the 'glmnet' package for LASSO in R. I want to understand how I can make the algorithm more accurate, so that the coefficients match those obtained from R. I think glmnet uses coordinate descent as well.
Note: I have generated some toy data with 11 observations and 6 features (x, x^2, x^3, ..., x^6). The last column contains the y values generated from a dummy function (e^(-x^2)). I want to use LASSO to estimate this function. Also, I have randomly picked the initial weight vector multiple times to cross-check my results.
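For reference, the per-coordinate update that this kind of naive coordinate descent performs is the soft-thresholding rule sketched below (the names rho, gamma, and z are illustrative; gamma corresponds to lambda/2 and z to the squared column norm computed by norm_feature in my code):

/* w_j = S(rho_j, lambda/2) / ||x_j||^2 */
float soft_threshold(float rho, float gamma, float z){
    if(rho > gamma) return (rho - gamma) / z;
    if(rho < -gamma) return (rho + gamma) / z;
    return 0.0f; /* the coefficient is shrunk exactly to zero */
}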
Here is my code:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<math.h>
#include<time.h>
int num_dim = 6;
int num_obs = 11;
/*Computes the normalization factor*/
float norm_feature(int j,double arr[][7],int n){
float sum = 0.0;
int i;
for(i=0;i<n;i++){
sum = sum + pow(arr[i][j],2);
}
return sum;
}
/*Computes the partial sum*/
float approx(int dim,int d_ignore,float weights[],double arr[][7],int i){
int flag = 1;
if(d_ignore == -1)
flag = 0;
int j;
float sum = 0.0;
for(j=0;j<dim;j++){
if(j != d_ignore)
sum = sum + weights[j]*arr[i][j];
else
continue;
}
return sum;
}
/* Computes rho-j */
float rho_j(double arr[][7],int n,int j,float weights[7]){
float sum = 0.0;
int i;
float partial_sum ;
for(i=0;i<n;i++){
partial_sum = approx(num_dim,j,weights,arr,i);
sum = sum + arr[i][j]*(arr[i][num_dim]-partial_sum);
}
return sum;
}
float intercept(float arr1[7],double arr[][7],int dim) {
int i;
float sum =0.0;
for (i = 0; i < num_obs; i++) {
sum = sum + pow((arr[i][num_dim]) - approx(num_dim, -1, arr1, arr, i), 1);
}
return sum;
}
int main(){
double data[num_obs][7];
int i=0,j=0;
float a = 1.0;
float lambda = 0.1; //Setting lambda
float weights[7]; //weights[6] contains the intercept
srand((unsigned int) time(NULL));
/*Generating the data matrix */
for(i=0;i<11;i++)
data[i][0] = ((float)rand()/(float)(RAND_MAX)) * a;
for(i=0;i<11;i++)
for(j=1;j<6;j++)
data[i][j] = pow(data[i][0],j+1);
for(i=0;i<11;i++)
data[i][6] = exp(-pow(data[i][0],2)); // the last column in the data matrix contains the y values generated by the dummy function
/*Printing the data matrix */
printf("Data Matrix:\n");
for(i=0;i<11;i++){
for(j=0;j<7;j++){
printf("%lf ",data[i][j]);}
printf("\n");}
printf("\n");
int seed =0;
while(seed<20) {
//Initializing the weight vector
for (i = 0; i < 7; i++)
weights[i] = ((float) rand() / (float) (RAND_MAX)) * a;
int iter = 500;
int t = 0;
int r, l;
double rho[num_dim];
for (i = 0; i < 6; i++) {
rho[i] = rho_j(data, num_obs, i, weights); /* use the loop index i; r is uninitialized at this point */
}
// Intercept initialization
weights[num_dim] = intercept(weights,data,num_dim);
printf("Weights initialization: ");
for (i = 0; i < (num_dim+1); i++)
printf("%f ", weights[i]);
printf("\n");
while (t < iter) {
for (r = 0; r < num_dim; r++) {
rho[r] = rho_j(data, num_obs, r, weights);
//printf("rho %d:%f ",r,rho[r]);
if (rho[r] < -lambda / 2)
weights[r] = (rho[r] + lambda / 2) / norm_feature(r, data, num_obs);
else if (rho[r] > lambda / 2)
weights[r] = (rho[r] - lambda / 2) / norm_feature(r, data, num_obs);
else
weights[r] = 0;
weights[num_dim] = intercept(weights, data, num_dim);
}
/* printf("Iter(%d): ", t);
for (l = 0; l < 7; l++)
printf("%f ", weights[l]);
printf("\n");*/
t++;
}
//printf("\n");
printf("Final Weights: ");
for (i = 0; i < 7; i++)
printf("%f ", weights[i]);
printf("\n");
printf("\n");
seed++;
}
return 0;
}
PseudoCode (a sketch of the coordinate-descent loop implemented above):
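initialize weights w[0..5] and the intercept randomly
repeat for a fixed number of iterations:
    for each feature j in 0..5:
        rho_j = sum_i x[i][j] * (y[i] - sum_{k != j} w[k]*x[i][k])
        if rho_j < -lambda/2:      w[j] = (rho_j + lambda/2) / sum_i x[i][j]^2
        else if rho_j > lambda/2:  w[j] = (rho_j - lambda/2) / sum_i x[i][j]^2
        else:                      w[j] = 0
        recompute the intercept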

Opencv - polynomial function fitting

In OpenCV (or another C++ library), is there a function similar to MATLAB's fit that can do 3D polynomial surface fitting (i.e. f(x,y) = p00 + p10*x + p01*y + p20*x^2 + p11*x*y + p02*y^2)? Thanks.
I don't think there is one in OpenCV, but you can do it like this:
int main( int argc, char** argv )
{
Mat z = imread("1449862093156643.jpg",CV_LOAD_IMAGE_GRAYSCALE);
Mat M = Mat_<double>(z.rows*z.cols,6);
Mat I=Mat_<double>(z.rows*z.cols,1);
for (int i=0;i<z.rows;i++)
for (int j = 0; j < z.cols; j++)
{
double x=(j - z.cols / 2) / double(z.cols),y= (i - z.rows / 2) / double(z.rows);
M.at<double>(i*z.cols+j, 0) = x*x;
M.at<double>(i*z.cols+j, 1) = y*y;
M.at<double>(i*z.cols+j, 2) = x*y;
M.at<double>(i*z.cols+j, 3) = x;
M.at<double>(i*z.cols+j, 4) = y;
M.at<double>(i*z.cols+j, 5) = 1;
I.at<double>(i*z.cols+j, 0) = z.at<uchar>(i,j);
}
SVD s(M);
Mat q;
s.backSubst(I,q);
cout<<q;
imshow("Orignal",z);
cout<<q.at<double>(2,0);
Mat background(z.rows,z.cols,CV_8UC1);
for (int i=0;i<z.rows;i++)
for (int j = 0; j < z.cols; j++)
{
double x=(j - z.cols / 2) / double(z.cols),y= (i - z.rows / 2) / double(z.rows);
double quad=q.at<double>(0,0)*x*x+q.at<double>(1,0)*y*y+q.at<double>(2,0)*x*y;
quad+=q.at<double>(3,0)*x+q.at<double>(4,0)*y+q.at<double>(5,0);
background.at<uchar>(i,j) = saturate_cast<uchar>(quad);
}
imshow("Simulated background",background);
waitKey();
return 0;
}
Original post is here
There is an undocumented function in OpenCV (contrib.hpp) called cv::polyfit(). It takes as input a Mat of x coordinates, another Mat of y coordinates, and an output Mat for the fitted coefficients. It is not very convenient to use Mats for this, but you can build a wrapper that accepts a vector of cv::Point points.
vector<float> fitPoly(const vector<Point> &src, int order){
Mat src_x = Mat((int)src.size(), 1, CV_32F);
Mat src_y = Mat((int)src.size(), 1, CV_32F);
for (int i = 0; i < (int)src.size(); i++){
src_x.at<float>(i, 0) = (float)src[i].x;
src_y.at<float>(i, 0) = (float)src[i].y;
}
Mat dst;
cv::polyfit(src_x, src_y, dst, order); // polyfit writes the fitted coefficients into dst
return vector<float>(dst.begin<float>(), dst.end<float>());
}
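A hypothetical usage (the points and the order here are illustrative), fitting a quadratic y(x) through a few points:

std::vector<cv::Point> pts = { cv::Point(0,1), cv::Point(1,3), cv::Point(2,7), cv::Point(3,13) };
std::vector<float> coeffs = fitPoly(pts, 2); // coefficients of the fitted quadratic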

Multi otsu(multi-thresholding) with openCV

I am trying to carry out multi-thresholding with Otsu's method. What I am currently doing is maximising the between-class variance, and I have managed to get the same threshold value as the one given by the OpenCV library. However, that is just from running Otsu's method once.
Documentation on how to do multi-level, or rather recursive, thresholding is rather limited. Where do I go after obtaining the original Otsu value? I would appreciate some hints; I've been playing around with the code, adding one external for loop, but the next value calculated is always 254 for any given image :(
My code if need be:
//compute histogram first
cv::Mat imageh; //image edited to grayscale for histogram purpose
//imageh=image; //to delete and uncomment below;
cv::cvtColor(image, imageh, CV_BGR2GRAY);
int histSize[1] = {256}; // number of bins
float hranges[2] = {0.0, 256.0}; // min and max pixel value
const float* ranges[1] = {hranges};
int channels[1] = {0}; // only 1 channel used
cv::MatND hist;
// Compute histogram
calcHist(&imageh, 1, channels, cv::Mat(), hist, 1, histSize, ranges);
IplImage* im = new IplImage(imageh);//assign the image to an IplImage pointer
IplImage* finalIm = cvCreateImage(cvSize(im->width, im->height), IPL_DEPTH_8U, 1);
double otsuThreshold= cvThreshold(im, finalIm, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU );
cout<<"opencv otsu gives "<<otsuThreshold<<endl;
int totalNumberOfPixels= imageh.total();
cout<<"total number of Pixels is " <<totalNumberOfPixels<< endl;
float sum = 0;
for (int t=0 ; t<256 ; t++)
{
sum += t * hist.at<float>(t);
}
cout<<"sum is "<<sum<<endl;
float sumB = 0; //sum of background
int wB = 0; // weight of background
int wF = 0; //weight of foreground
float varMax = 0;
int threshold = 0;
// iterate to find the threshold that maximises the between-class variance
for (int t=0 ; t<256 ; t++)
{
wB += hist.at<float>(t); // Weight Background
if (wB == 0) continue;
wF = totalNumberOfPixels - wB; // Weight Foreground
if (wF == 0) break;
sumB += (float) (t * hist.at<float>(t));
float mB = sumB / wB; // Mean Background
float mF = (sum - sumB) / wF; // Mean Foreground
// Calculate Between Class Variance
float varBetween = (float)wB * (float)wF * (mB - mF) * (mB - mF);
// Check if new maximum found
if (varBetween > varMax) {
varMax = varBetween;
threshold = t;
}
}
cout<<"threshold value is: "<<threshold;
To extend Otsu's thresholding method to multi-level thresholding, the between-class variance equation becomes:
sigma_B^2 = sum_k W_k * (M_k - MT)^2
where W_k and M_k are the weight and mean of the k-th class and MT is the total mean of the histogram (these correspond to the WxK, Mx, and MT variables in the code below).
Please check out Deng-Yuan Huang, Ta-Wei Lin, Wu-Chih Hu, "Automatic Multilevel Thresholding Based on Two-Stage Otsu's Method with Cluster Determination by Valley Estimation", Int. Journal of Innovative Computing, 2011, 7:5631-5644 for more information.
http://www.ijicic.org/ijicic-10-05033.pdf
Here is my C# implementation of Otsu Multi for 2 thresholds:
/* Otsu (1979) - multi */
Tuple<int, int> otsuMulti(object sender, EventArgs e) {
//image histogram
int[] histogram = new int[256];
//total number of pixels
int N = 0;
//accumulate image histogram and total number of pixels
foreach(int intensity in image.Data) {
if (intensity != 0) {
histogram[intensity] += 1;
N++;
}
}
double W0K, W1K, W2K, M0, M1, M2, currVarB, optimalThresh1, optimalThresh2, maxBetweenVar, M0K, M1K, M2K, MT;
optimalThresh1 = 0;
optimalThresh2 = 0;
W0K = 0;
W1K = 0;
M0K = 0;
M1K = 0;
MT = 0;
maxBetweenVar = 0;
for (int k = 0; k <= 255; k++) {
MT += k * (histogram[k] / (double) N);
}
for (int t1 = 0; t1 <= 255; t1++) {
W0K += histogram[t1] / (double) N; //Pi
M0K += t1 * (histogram[t1] / (double) N); //i * Pi
M0 = M0K / W0K; //(i * Pi)/Pi
W1K = 0;
M1K = 0;
for (int t2 = t1 + 1; t2 <= 255; t2++) {
W1K += histogram[t2] / (double) N; //Pi
M1K += t2 * (histogram[t2] / (double) N); //i * Pi
M1 = M1K / W1K; //(i * Pi)/Pi
W2K = 1 - (W0K + W1K);
M2K = MT - (M0K + M1K);
if (W2K <= 0) break;
M2 = M2K / W2K;
currVarB = W0K * (M0 - MT) * (M0 - MT) + W1K * (M1 - MT) * (M1 - MT) + W2K * (M2 - MT) * (M2 - MT);
if (maxBetweenVar < currVarB) {
maxBetweenVar = currVarB;
optimalThresh1 = t1;
optimalThresh2 = t2;
}
}
}
return new Tuple<int, int>((int)optimalThresh1, (int)optimalThresh2);
}
And this is the result I got by thresholding an image scan of soil with the above code (T1 = 110, T2 = 147).
Otsu's original paper, "Nobuyuki Otsu, A Threshold Selection Method from Gray-Level Histograms, IEEE Transactions on Systems, Man, and Cybernetics, 1979, 9:62-66", also briefly mentions the extension to multi-thresholding.
https://engineering.purdue.edu/kak/computervision/ECE661.08/OTSU_paper.pdf
Hope this helps.
Here is a simple, general approach for 'n' thresholds in Python (3+):
# developed by- SUJOY KUMAR GOSWAMI
# source paper- https://people.ece.cornell.edu/acharya/papers/mlt_thr_img.pdf
import cv2
import numpy as np
import math
img = cv2.imread('path-to-image')
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
a = 0
b = 255
n = 6 # number of thresholds (better choose even value)
k = 0.7 # free variable to take any positive value
T = [] # list which will contain 'n' thresholds
def sujoy(img, a, b):
    if a > b:
        s = -1
        m = -1
        return m, s
    img = np.array(img)
    t1 = (img >= a)
    t2 = (img <= b)
    X = np.multiply(t1, t2)
    Y = np.multiply(img, X)
    s = np.sum(X)
    m = np.sum(Y)/s
    return m, s

for i in range(int(n/2 - 1)):
    img = np.array(img)
    t1 = (img >= a)
    t2 = (img <= b)
    X = np.multiply(t1, t2)
    Y = np.multiply(img, X)
    mu = np.sum(Y)/np.sum(X)
    Z = Y - mu
    Z = np.multiply(Z, X)
    W = np.multiply(Z, Z)
    sigma = math.sqrt(np.sum(W)/np.sum(X))
    T1 = mu - k*sigma
    T2 = mu + k*sigma
    x, y = sujoy(img, a, T1)
    w, z = sujoy(img, T2, b)
    T.append(x)
    T.append(w)
    a = T1 + 1
    b = T2 - 1
    k = k*(i+1)

T1 = mu
T2 = mu + 1
x, y = sujoy(img, a, T1)
w, z = sujoy(img, T2, b)
T.append(x)
T.append(w)
T.sort()
print(T)
For the full paper and more information, visit this link.
I've written an example of how Otsu thresholding works in Python before. You can see the source code here: https://github.com/subokita/Sandbox/blob/master/otsu.py
The example has 2 variants: otsu2(), which is the optimised version as seen on the Wikipedia page, and otsu(), which is a more naive implementation based on the algorithm description itself.
If you are okay with reading Python code (in this case it is pretty simple, almost pseudo-code-like), you might want to look at otsu() in the example and modify it. Porting it to C++ is not hard either.
@Antoni4 gives the best answer in my opinion, and it's very straightforward to increase the number of levels.
This is for three-level thresholding:
#include "Shadow01-1.cuh"
void multiThresh(double &optimalThresh1, double &optimalThresh2, double &optimalThresh3, cv::Mat &imgHist, cv::Mat &src)
{
double W0K, W1K, W2K, W3K, M0, M1, M2, M3, currVarB, maxBetweenVar, M0K, M1K, M2K, M3K, MT;
unsigned char *histogram = (unsigned char*)(imgHist.data);
int N = src.rows*src.cols;
W0K = 0;
W1K = 0;
M0K = 0;
M1K = 0;
MT = 0;
maxBetweenVar = 0;
for (int k = 0; k <= 255; k++) {
MT += k * (histogram[k] / (double) N);
}
for (int t1 = 0; t1 <= 255; t1++)
{
W0K += histogram[t1] / (double) N; //Pi
M0K += t1 * (histogram[t1] / (double) N); //i * Pi
M0 = M0K / W0K; //(i * Pi)/Pi
W1K = 0;
M1K = 0;
for (int t2 = t1 + 1; t2 <= 255; t2++)
{
W1K += histogram[t2] / (double) N; //Pi
M1K += t2 * (histogram[t2] / (double) N); //i * Pi
M1 = M1K / W1K; //(i * Pi)/Pi
W2K = 1 - (W0K + W1K);
M2K = MT - (M0K + M1K);
if (W2K <= 0) break;
M2 = M2K / W2K;
W3K = 0;
M3K = 0;
for (int t3 = t2 + 1; t3 <= 255; t3++)
{
W2K += histogram[t3] / (double) N; //Pi
M2K += t3 * (histogram[t3] / (double) N); // i*Pi
M2 = M2K / W2K; //(i*Pi)/Pi
W3K = 1 - (W1K + W2K);
M3K = MT - (M1K + M2K);
M3 = M3K / W3K;
currVarB = W0K * (M0 - MT) * (M0 - MT) + W1K * (M1 - MT) * (M1 - MT) + W2K * (M2 - MT) * (M2 - MT) + W3K * (M3 - MT) * (M3 - MT);
if (maxBetweenVar < currVarB)
{
maxBetweenVar = currVarB;
optimalThresh1 = t1;
optimalThresh2 = t2;
optimalThresh3 = t3;
}
}
}
}
}
@Guilherme Silva
Your code has a bug. You must replace:
W3K = 0;
M3K = 0;
with
W2K = 0;
M2K = 0;
and
W3K = 1 - (W1K + W2K);
M3K = MT - (M1K + M2K);
with
W3K = 1 - (W0K + W1K + W2K);
M3K = MT - (M0K + M1K + M2K);
;-)
Regards
EDIT(1): [Toby Speight]
I discovered this bug by applying the effect to the same picture at different resolutions (sizes) and seeing that the output results were too different from each other (even when changing the resolution only a little).
W3K and M3K must be the totals minus the previous WKs and MKs (I chose this form for code similarity with the version with one level less).
At the moment, due to my limited English, I cannot explain it better.
To be honest, I'm still not 100% sure that this way is correct, even though from my outputs I can tell that it gives better results (even with one level more, i.e. 5 shades of gray).
You could try it yourself ;-)
Sorry
My outputs:
[image: 3 thresholds]
[image: 4 thresholds]
I found a useful piece of code in this thread. I was looking for a multi-level Otsu implementation for double/float images, so I tried to generalize the example to N levels with a double/float matrix as input. My code below uses the Armadillo library as a dependency, but it can easily be adapted to standard C++ arrays: just replace vec and uvec objects with one-dimensional double and integer arrays, and mat and umat with two-dimensional ones. Two other Armadillo functions used here are vectorise and hist.
// Input parameters:
// map - input image (double matrix)
// nBins - number of histogram bins
// nLevels - number of Otsu thresholds
#include <armadillo>
#include <algorithm>
#include <vector>
using namespace arma;
using namespace std;
int nCombinations(int n, int r); // forward declaration; defined below
mat OtsuFilterMulti(mat map, int nBins, int nLevels) {
mat mapr; // output thresholded image
mapr = zeros<mat>(map.n_rows, map.n_cols);
unsigned int numElem = 0;
vec threshold = zeros<vec>(nLevels);
vec q = zeros<vec>(nLevels + 1);
vec mu = zeros<vec>(nLevels + 1);
vec muk = zeros<vec>(nLevels + 1);
uvec binv = zeros<uvec>(nLevels);
if (nLevels <= 1) return mapr;
numElem = map.n_rows*map.n_cols;
uvec histogram = hist(vectorise(map), nBins);
double maxval = map.max();
double minval = map.min();
double odelta = (maxval - abs(minval)) / nBins; // distance between histogram bins
vec oval = zeros<vec>(nBins);
double mt = 0, variance = 0.0, bestVariance = 0.0;
for (int ii = 0; ii < nBins; ii++) {
oval(ii) = (double)odelta*ii + (double)odelta*0.5; // centers of histogram bins
mt += (double)ii*((double)histogram(ii)) / (double)numElem;
}
for (int ii = 0; ii < nLevels; ii++) {
binv(ii) = ii;
}
double sq, smuk;
int nComb;
nComb = nCombinations(nBins,nLevels);
std::vector<bool> v(nBins);
std::fill(v.begin(), v.begin() + nLevels, true);
umat ibin = zeros<umat>(nComb, nLevels); // indices from combinations will be stored here
int cc = 0;
int ci = 0;
do {
for (int i = 0; i < nBins; ++i) {
if(ci==nLevels) ci=0;
if (v[i]) {
ibin(cc,ci) = i;
ci++;
}
}
cc++;
} while (std::prev_permutation(v.begin(), v.end()));
uvec lastIndex = zeros<uvec>(nLevels);
// Perform operations on pre-calculated indices
for (int ii = 0; ii < nComb; ii++) {
for (int jj = 0; jj < nLevels; jj++) {
smuk = 0;
sq = 0;
if (lastIndex(jj) != ibin(ii, jj) || ii == 0) {
q(jj) += double(histogram(ibin(ii, jj))) / (double)numElem;
muk(jj) += ibin(ii, jj)*(double(histogram(ibin(ii, jj)))) / (double)numElem;
mu(jj) = muk(jj) / q(jj);
q(jj + 1) = 0.0;
muk(jj + 1) = 0.0;
if (jj>0) {
for (int kk = 0; kk <= jj; kk++) {
sq += q(kk);
smuk += muk(kk);
}
q(jj + 1) = 1 - sq;
muk(jj + 1) = mt - smuk;
mu(jj + 1) = muk(jj + 1) / q(jj + 1);
}
if (jj>0 && jj<(nLevels - 1)) {
q(jj + 1) = 0.0;
muk(jj + 1) = 0.0;
}
lastIndex(jj) = ibin(ii, jj);
}
}
variance = 0.0;
for (int jj = 0; jj <= nLevels; jj++) {
variance += q(jj)*(mu(jj) - mt)*(mu(jj) - mt);
}
if (variance > bestVariance) {
bestVariance = variance;
for (int jj = 0; jj<nLevels; jj++) {
threshold(jj) = oval(ibin(ii, jj));
}
}
}
cout << "Optimized thresholds: ";
for (int jj = 0; jj<nLevels; jj++) {
cout << threshold(jj) << " ";
}
cout << endl;
for (unsigned int jj = 0; jj<map.n_rows; jj++) {
for (unsigned int kk = 0; kk<map.n_cols; kk++) {
for (int ll = 0; ll<nLevels; ll++) {
if (map(jj, kk) >= threshold(ll)) {
mapr(jj, kk) = ll+1;
}
}
}
}
return mapr;
}
int nCombinations(int n, int r) {
if (r>n) return 0;
if (r*2 > n) r = n-r;
if (r == 0) return 1;
int ret = n;
for( int i = 2; i <= r; ++i ) {
ret *= (n-i+1);
ret /= i;
}
return ret;
}
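A hypothetical usage (the input matrix here is illustrative): threshold a double-precision image into nLevels+1 classes using 128 histogram bins:

arma::mat img = arma::randu<arma::mat>(64, 64); // random test image in [0,1]
arma::mat labels = OtsuFilterMulti(img, 128, 3); // labels take values 0..3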

stripes while calculating image gradient with CUDA

I'm writing code for image denoising and came across a strange problem with stripes in the processed images. Basically, when I'm calculating the X-gradient of an image, horizontal stripes appear (or vertical ones for the Y direction) [image: Lena X gradient].
The whole algorithm works OK and it looks like I'm getting the correct answer (I'm comparing with a program in C), except for those annoying stripes [image: Lena result].
The distance between the stripes changes with different block sizes. I'm also getting different stripe positions each time I run the program! Here is the part of the program related to the gradient calculation. I have a feeling that I'm doing something very stupid :) Thank you!
#define BLKXSIZE 16
#define BLKYSIZE 16
#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
void Diff4th_GPU(float* A, float* B, int N, int M, int Z, float sigma, int iter, float tau, int type)
{
float *Ad;
dim3 dimBlock(BLKXSIZE,BLKYSIZE);
dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE));
cudaMalloc((void**)&Ad,N*M*sizeof(float));
cudaMemcpy(Ad,A,N*M*sizeof(float),cudaMemcpyHostToDevice);
cudaCheckErrors("cc1");
int n = 1;
while (n <= iter) {
Diff4th2D<<<dimGrid,dimBlock>>>(Ad, N, M, sigma, iter, tau, type);
n++;
cudaDeviceSynchronize();
cudaCheckErrors("kernel");}
cudaMemcpy(B,Ad,N*M*sizeof(float),cudaMemcpyDeviceToHost);
cudaCheckErrors("cc2");
cudaFree(Ad);
}
__global__ void Diff4th2D(float* A, int N, int M, float sigma, int iter, float tau, int type)
{
float gradX, gradX_sq, gradY, gradY_sq, gradXX, gradYY, gradXY, sq_sum, xy_2, Lam, V_norm, V_orth, c, c_sq, lam_t;
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
int index = j + i*N;
if ((i < N) && (j < M))
{
float gradX = 0, gradY = 0, gradXX = 0, gradYY = 0, gradXY = 0;
if ((i>1) && (i<N)) {
if ((j>1) && (j<M)){
int indexN = (j)+(i-1)*(N);
if (indexN > ((N*M)-1)) indexN = (N*M)-1;
if (indexN < 0) indexN = 0;
int indexS = (j)+(i+1)*(N);
if (indexS > ((N*M)-1)) indexS = (N*M)-1;
if (indexS < 0) indexS = 0;
int indexW = (j-1)+(i)*(N);
if (indexW > ((N*M)-1)) indexW = (N*M)-1;
if (indexW < 0) indexW = 0;
int indexE = (j+1)+(i)*(N);
if (indexE > ((N*M)-1)) indexE = (N*M)-1;
if (indexE < 0) indexE = 0;
gradX = 0.5*(A[indexN]-A[indexS]);
A[index] = gradX;
}
}
}
}
You have a race condition inside your kernel: since elements of A are both read and written by neighbouring threads, an element may or may not be overwritten before it is used.
Use different arrays for input and output.
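A minimal sketch of that fix, assuming the launch configuration from the question (the in/out parameter names and the host-side buffer names are illustrative): the kernel reads only from in and writes only to out, and the host swaps the two device pointers between iterations.

__global__ void Diff4th2D(const float* in, float* out, int N, int M)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if (i > 1 && i < N-1 && j > 1 && j < M-1)
    {
        // central difference in X; neighbours are read from `in` only,
        // so no thread can observe a partially updated value
        out[j + i*N] = 0.5f*(in[j + (i-1)*N] - in[j + (i+1)*N]);
    }
}
// Host side, inside the iteration loop:
//   Diff4th2D<<<dimGrid,dimBlock>>>(Ad_in, Ad_out, N, M);
//   std::swap(Ad_in, Ad_out); // ping-pong the buffers between iterations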
