Pitch Shifting on Blackberry - blackberry

I'm trying to change the pitch of some recorded sound using the PitchShifter.java class below:
package mypackage;
import net.rim.device.api.util.MathUtilities;
//package com.course.android.voicechanger;
//import android.util.Log;
/****************************************************************************
*
* NAME: PitchShift.cs
* VERSION: 1.2
* HOME URL: http://www.dspdimension.com
* KNOWN BUGS: none
*
* SYNOPSIS: Routine for doing pitch shifting while maintaining
* duration using the Short Time Fourier Transform.
*
* DESCRIPTION: The routine takes a pitchShift factor value which is between 0.5
* (one octave down) and 2. (one octave up). A value of exactly 1 does not change
* the pitch. numSampsToProcess tells the routine how many samples in indata[0...
* numSampsToProcess-1] should be pitch shifted and moved to outdata[0 ...
* numSampsToProcess-1]. The two buffers can be identical (ie. it can process the
* data in-place). fftFrameSize defines the FFT frame size used for the
* processing. Typical values are 1024, 2048 and 4096. It may be any value <=
* MAX_FRAME_LENGTH but it MUST be a power of 2. osamp is the STFT
* oversampling factor which also determines the overlap between adjacent STFT
* frames. It should at least be 4 for moderate scaling ratios. A value of 32 is
* recommended for best quality. sampleRate takes the sample rate for the signal
* in unit Hz, ie. 44100 for 44.1 kHz audio. The data passed to the routine in
* indata[] should be in the range [-1.0, 1.0), which is also the output range
* for the data, make sure you scale the data accordingly (for 16bit signed integers
* you would have to divide (and multiply) by 32768).
*
* COPYRIGHT 1999-2006 Stephan M. Bernsee <smb [AT] dspdimension [DOT] com>
*
* The Wide Open License (WOL)
*
* Permission to use, copy, modify, distribute and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice and this license appear in all source copies.
* THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF
* ANY KIND. See www.dspguru.com/wol.htm for more information.
*
*****************************************************************************/
/****************************************************************************
*
* This code was converted to C# by Michael Knight madmik3 at gmail dot com. sites.google.com/site/mikescoderama/
*
*****************************************************************************/
/**
 * Pitch shifter based on the Short Time Fourier Transform (phase vocoder),
 * ported from Stephan M. Bernsee's smbPitchShift routine.
 *
 * All state lives in static buffers, so the class processes ONE audio stream
 * at a time and is not thread-safe. The FIFO position ({@code gRover}) and the
 * phase accumulators persist between calls so that consecutive calls continue
 * the same stream; keep fftFrameSize and osamp constant for a given stream.
 */
public class PitchShifter2 {
    /** Upper bound for fftFrameSize; every work buffer is sized from it. */
    private static final int MAX_FRAME_LENGTH = 16000;

    private static final float[] gInFIFO = new float[MAX_FRAME_LENGTH];
    private static final float[] gOutFIFO = new float[MAX_FRAME_LENGTH];
    /** Interleaved (re, im) FFT work space. */
    private static final float[] gFFTworksp = new float[2 * MAX_FRAME_LENGTH];
    private static final float[] gLastPhase = new float[MAX_FRAME_LENGTH / 2 + 1];
    private static final float[] gSumPhase = new float[MAX_FRAME_LENGTH / 2 + 1];
    private static final float[] gOutputAccum = new float[2 * MAX_FRAME_LENGTH];
    private static final float[] gAnaFreq = new float[MAX_FRAME_LENGTH];
    private static final float[] gAnaMagn = new float[MAX_FRAME_LENGTH];
    private static final float[] gSynFreq = new float[MAX_FRAME_LENGTH];
    private static final float[] gSynMagn = new float[MAX_FRAME_LENGTH];

    /** Write position inside the input FIFO; 0 means "not started yet". */
    private static long gRover;

    /**
     * Pitch-shifts {@code indata} in place using a 1024-point frame and an
     * oversampling factor of 32.
     *
     * @param pitchShift        shift factor (0.5 = octave down, 2.0 = octave up)
     * @param numSampsToProcess number of samples of {@code indata} to process
     * @param sampleRate        sample rate in Hz
     * @param indata            samples in [-1.0, 1.0); overwritten with the result
     */
    public static void PitchShift2(float pitchShift, long numSampsToProcess, float sampleRate, float[] indata) {
        PitchShift2(pitchShift, numSampsToProcess, (long) 1024, (long) 32, sampleRate, indata);
    }

    /**
     * Pitch-shifts {@code indata} in place while keeping its duration.
     *
     * @param pitchShift        shift factor (0.5 = octave down, 2.0 = octave up)
     * @param numSampsToProcess number of samples of {@code indata} to process
     * @param fftFrameSize      FFT frame size; a power of 2 not larger than MAX_FRAME_LENGTH
     * @param osamp             STFT oversampling factor (at least 1, at most fftFrameSize)
     * @param sampleRate        sample rate in Hz
     * @param indata            samples in [-1.0, 1.0); overwritten with the result
     * @throws IllegalArgumentException if fftFrameSize or osamp is out of range
     */
    public static void PitchShift2(float pitchShift, long numSampsToProcess, long fftFrameSize, long osamp, float sampleRate, float[] indata) {
        if (fftFrameSize < 2 || fftFrameSize > MAX_FRAME_LENGTH || (fftFrameSize & (fftFrameSize - 1)) != 0) {
            throw new IllegalArgumentException("fftFrameSize must be a power of 2 and <= " + MAX_FRAME_LENGTH + ": " + fftFrameSize);
        }
        if (osamp < 1 || osamp > fftFrameSize) {
            throw new IllegalArgumentException("osamp must be between 1 and fftFrameSize: " + osamp);
        }

        double magn, phase, tmp, window, real, imag;
        double freqPerBin, expct;
        long i, k, qpd, index, inFifoLatency, stepSize, fftFrameSize2;
        float[] outdata = indata; // processing is done in place

        /* set up some handy variables */
        fftFrameSize2 = fftFrameSize / 2;
        stepSize = fftFrameSize / osamp;
        freqPerBin = sampleRate / (double) fftFrameSize;
        expct = 2.0 * Math.PI * (double) stepSize / (double) fftFrameSize;
        inFifoLatency = fftFrameSize - stepSize;
        if (gRover == 0) {
            gRover = inFifoLatency;
        }

        /* counters for the periodic progress trace below */
        int c = 0;
        int round = 0;

        /* main processing loop */
        for (i = 0; i < numSampsToProcess; i++) {
            /* As long as we have not yet collected enough data just read in */
            gInFIFO[(int) gRover] = indata[(int) i];
            outdata[(int) i] = gOutFIFO[(int) (gRover - inFifoLatency)];
            gRover++;

            /* now we have enough data for processing */
            if (gRover >= fftFrameSize) {
                c++;
                if (c > 100) {
                    System.out.println("PitchShifter2.PitchShift(.....................): Liwei" + "round= " + round++);
                    c = 0;
                }
                gRover = inFifoLatency;

                /* do windowing (Hann) and re,im interleave */
                for (k = 0; k < fftFrameSize; k++) {
                    window = -.5 * Math.cos(2.0 * Math.PI * (double) k / (double) fftFrameSize) + .5;
                    gFFTworksp[(int) (2 * k)] = (float) (gInFIFO[(int) k] * window);
                    gFFTworksp[(int) (2 * k + 1)] = 0.0F;
                }

                /* ***************** ANALYSIS ******************* */
                ShortTimeFourierTransform(gFFTworksp, fftFrameSize, -1);
                for (k = 0; k <= fftFrameSize2; k++) {
                    /* de-interlace FFT buffer */
                    real = gFFTworksp[(int) (2 * k)];
                    imag = gFFTworksp[(int) (2 * k + 1)];
                    /* compute magnitude and phase */
                    magn = 2.0 * Math.sqrt(real * real + imag * imag);
                    phase = MathUtilities.atan2(imag, real);
                    /* compute phase difference */
                    tmp = phase - gLastPhase[(int) k];
                    gLastPhase[(int) k] = (float) phase;
                    /* subtract expected phase difference */
                    tmp -= (double) k * expct;
                    /* map delta phase into +/- Pi interval */
                    qpd = (long) (tmp / Math.PI);
                    if (qpd >= 0) {
                        qpd += qpd & 1;
                    } else {
                        qpd -= qpd & 1;
                    }
                    tmp -= Math.PI * (double) qpd;
                    /* get deviation from bin frequency from the +/- Pi interval */
                    tmp = osamp * tmp / (2.0 * Math.PI);
                    /* compute the k-th partials' true frequency */
                    tmp = (double) k * freqPerBin + tmp * freqPerBin;
                    /* store magnitude and true frequency in analysis arrays */
                    gAnaMagn[(int) k] = (float) magn;
                    gAnaFreq[(int) k] = (float) tmp;
                }

                /* ***************** PROCESSING ******************* */
                /* this does the actual pitch shifting */
                for (int zero = 0; zero < fftFrameSize; zero++) {
                    gSynMagn[zero] = 0;
                    gSynFreq[zero] = 0;
                }
                for (k = 0; k <= fftFrameSize2; k++) {
                    index = (long) (k * pitchShift);
                    if (index <= fftFrameSize2) {
                        gSynMagn[(int) index] += gAnaMagn[(int) k];
                        gSynFreq[(int) index] = gAnaFreq[(int) k] * pitchShift;
                    }
                }

                /* ***************** SYNTHESIS ******************* */
                for (k = 0; k <= fftFrameSize2; k++) {
                    /* get magnitude and true frequency from synthesis arrays */
                    magn = gSynMagn[(int) k];
                    tmp = gSynFreq[(int) k];
                    /* subtract bin mid frequency */
                    tmp -= (double) k * freqPerBin;
                    /* get bin deviation from freq deviation */
                    tmp /= freqPerBin;
                    /* take osamp into account */
                    tmp = 2.0 * Math.PI * tmp / osamp;
                    /* add the overlap phase advance back in */
                    tmp += (double) k * expct;
                    /* accumulate delta phase to get bin phase */
                    gSumPhase[(int) k] += (float) tmp;
                    phase = gSumPhase[(int) k];
                    /* get real and imag part and re-interleave */
                    gFFTworksp[(int) (2 * k)] = (float) (magn * Math.cos(phase));
                    gFFTworksp[(int) (2 * k + 1)] = (float) (magn * Math.sin(phase));
                }
                /* zero negative frequencies */
                for (k = fftFrameSize + 2; k < 2 * fftFrameSize; k++) {
                    gFFTworksp[(int) k] = 0.0F;
                }
                /* do inverse transform */
                ShortTimeFourierTransform(gFFTworksp, fftFrameSize, 1);

                /* do windowing and add to output accumulator */
                for (k = 0; k < fftFrameSize; k++) {
                    window = -.5 * Math.cos(2.0 * Math.PI * (double) k / (double) fftFrameSize) + .5;
                    gOutputAccum[(int) k] += (float) (2.0 * window * gFFTworksp[(int) (2 * k)] / (fftFrameSize2 * osamp));
                }
                for (k = 0; k < stepSize; k++) {
                    gOutFIFO[(int) k] = gOutputAccum[(int) k];
                }
                /* shift accumulator left by one hop (memmove in the C original) */
                for (k = 0; k < fftFrameSize; k++) {
                    gOutputAccum[(int) k] = gOutputAccum[(int) (k + stepSize)];
                }
                /* move input FIFO */
                for (k = 0; k < inFifoLatency; k++) {
                    gInFIFO[(int) k] = gInFIFO[(int) (k + stepSize)];
                }
            }
        }
    }

    /**
     * In-place radix-2 FFT on interleaved complex data.
     *
     * @param fftBuffer    2 * fftFrameSize floats laid out as re0, im0, re1, im1, ...
     * @param fftFrameSize number of complex points; must be a power of 2
     * @param sign         -1 for the forward transform, +1 for the inverse (unscaled)
     */
    public static void ShortTimeFourierTransform(float[] fftBuffer, long fftFrameSize, long sign) {
        float wr, wi, arg, temp;
        float tr, ti, ur, ui;
        long i, bitm, j, le, le2, k;

        /*
         * Stage 1: bit-reversal permutation. This loop MUST be closed before
         * the butterfly stages start: the butterflies reuse i as their own
         * index, so nesting them here ends the permutation after one pass.
         */
        for (i = 2; i < 2 * fftFrameSize - 2; i += 2) {
            for (bitm = 2, j = 0; bitm < 2 * fftFrameSize; bitm <<= 1) {
                if ((i & bitm) != 0) {
                    j++;
                }
                j <<= 1;
            }
            if (i < j) {
                temp = fftBuffer[(int) i];
                fftBuffer[(int) i] = fftBuffer[(int) j];
                fftBuffer[(int) j] = temp;
                temp = fftBuffer[(int) (i + 1)];
                fftBuffer[(int) (i + 1)] = fftBuffer[(int) (j + 1)];
                fftBuffer[(int) (j + 1)] = temp;
            }
        }

        /* Stage 2: log2(fftFrameSize) butterfly passes. */
        long max = (long) (MathUtilities.log(fftFrameSize) / MathUtilities.log(2.0) + .5);
        for (k = 0, le = 2; k < max; k++) {
            le <<= 1;
            le2 = le >> 1;
            ur = 1.0F;
            ui = 0.0F;
            arg = (float) Math.PI / (le2 >> 1);
            wr = (float) Math.cos(arg);
            wi = (float) (sign * Math.sin(arg));
            for (j = 0; j < le2; j += 2) {
                for (i = j; i < 2 * fftFrameSize; i += le) {
                    tr = fftBuffer[(int) (i + le2)] * ur - fftBuffer[(int) (i + le2 + 1)] * ui;
                    ti = fftBuffer[(int) (i + le2)] * ui + fftBuffer[(int) (i + le2 + 1)] * ur;
                    fftBuffer[(int) (i + le2)] = fftBuffer[(int) i] - tr;
                    fftBuffer[(int) (i + le2 + 1)] = fftBuffer[(int) (i + 1)] - ti;
                    fftBuffer[(int) i] += tr;
                    fftBuffer[(int) (i + 1)] += ti;
                }
                /* advance the twiddle factor: (ur, ui) *= (wr, wi) */
                tr = ur * wr - ui * wi;
                ui = ur * wi + ui * wr;
                ur = tr;
            }
        }
    }
}
I read the sound file into a byte array, convert it into a float array and pass that to PitchShift2().
After that I convert the float array back into a byte array, build a stream from it and pass the stream to the player,
but the player throws an "Unsupported file format" exception.
I have also taken care of endianness when converting bytes to floats and vice versa.
Can anyone tell me how to use this class properly?

Related

CUDA tiled 2D Convolution in shared memory is slower than global memory

I performed two convolution using constant memory for mask.
One without tiling in global memory:
// Convolves a w x h single-channel 8-bit image with the SOBEL_OP_DIM x
// SOBEL_OP_DIM mask stored in constMask, reading directly from global memory.
// Taps that fall outside the image contribute nothing (zero padding).
// One thread produces one output pixel; |sum| is narrowed to uint8_t.
__global__ void constGradientConvolution(uint8_t* inputImgData, uint8_t* gradientImgData, int w, int h) {
    // Global pixel position handled by this thread
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so threads past the right or
    // bottom edge must not write: without this guard they store out of bounds.
    if (row >= h || col >= w) {
        return;
    }

    // Top-left corner of the mask footprint
    const int start_r = row - SOBEL_OP_RADIUS;
    const int start_c = col - SOBEL_OP_RADIUS;

    int temp = 0;
    for (int i = 0; i < SOBEL_OP_DIM; i++) {
        const int r = start_r + i;
        // A row outside the image contributes nothing; skip all its columns
        if (r < 0 || r >= h) {
            continue;
        }
        for (int j = 0; j < SOBEL_OP_DIM; j++) {
            const int c = start_c + j;
            if (c >= 0 && c < w) {
                temp += inputImgData[r * w + c] * constMask[i * SOBEL_OP_DIM + j];
            }
        }
    }

    // Write back the result
    gradientImgData[row * w + col] = (uint8_t)abs(temp);
}
and one with tiling, loading in shared memory, credits to https://www.cstechera.com/2015/07/two-dimensional-2d-image-convolution-in-CUDA.html:
// Tiled variant of the gradient convolution: each TILE_WIDTH x TILE_HEIGHT
// block first stages its input tile plus a SOBEL_OP_RADIUS halo in shared
// memory (zero-filled outside the image), then every thread convolves its
// pixel from that staged copy. Expects blockDim == (TILE_WIDTH, TILE_HEIGHT).
__global__ void tiledGradientConvolution(uint8_t* inputImgData, uint8_t* gradientImgData, int width, int height) {
    __shared__ uint8_t N_ds[SharedDim_y][SharedDim_x];

    // Image coordinates of shared-memory element [0][0]
    const int tileOriginY = (int)(blockIdx.y * TILE_HEIGHT) - SOBEL_OP_RADIUS;
    const int tileOriginX = (int)(blockIdx.x * TILE_WIDTH) - SOBEL_OP_RADIUS;

    // Cooperative load: the block walks the shared tile in strides of one
    // block's worth of threads, so the halo needs no separate code path.
    const int linearTid = threadIdx.y * TILE_WIDTH + threadIdx.x;
    for (int dest = linearTid; dest < SharedDim_x * SharedDim_y; dest += TILE_WIDTH * TILE_HEIGHT) {
        const int destY = dest / SharedDim_x;
        const int destX = dest % SharedDim_x;
        const int srcY = tileOriginY + destY;
        const int srcX = tileOriginX + destX;
        const bool inside = srcY >= 0 && srcY < height && srcX >= 0 && srcX < width;
        N_ds[destY][destX] = inside ? inputImgData[srcY * width + srcX] : 0;
    }

    // Every element of the tile must be staged before any thread reads it
    __syncthreads();

    int temp = 0;
    for (int y = 0; y < SOBEL_OP_DIM; y++) {
        for (int x = 0; x < SOBEL_OP_DIM; x++) {
            temp += N_ds[threadIdx.y + y][threadIdx.x + x] * constMask[y * SOBEL_OP_DIM + x];
        }
    }

    const int outY = blockIdx.y * TILE_HEIGHT + threadIdx.y;
    const int outX = blockIdx.x * TILE_WIDTH + threadIdx.x;
    if (outY < height && outX < width) {
        gradientImgData[outY * width + outX] = (uint8_t)abs(temp);
    }
    // No barrier is needed here: nothing touches shared memory after this point.
}
According to nvprof, the shared-memory implementation is slower:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 53.27% 387.52us 2 193.76us 190.70us 196.82us [CUDA memcpy DtoH]
24.28% 176.62us 2 88.311us 608ns 176.01us [CUDA memcpy HtoD]
11.56% 84.102us 1 84.102us 84.102us 84.102us tiledGradientConvolution(unsigned char*, unsigned char*, int, int)
10.90% 79.270us 1 79.270us 79.270us 79.270us constGradientConvolution(unsigned char*, unsigned char*, int, int)
this is the configuration kernel:
// Side length of the square convolution mask, and its radius (1 for a 3x3 mask)
#define SOBEL_OP_DIM 3
#define SOBEL_OP_RADIUS (SOBEL_OP_DIM / 2)
// Tile dimensions: output pixels per thread block in x and y
#define TILE_WIDTH 16
#define TILE_HEIGHT 16
// Convolution mask, stored row-major in constant memory
__constant__ int constMask[SOBEL_OP_DIM * SOBEL_OP_DIM];
// Shared-memory tile size: the output tile plus a halo of SOBEL_OP_RADIUS on each side
#define SharedDim_x (TILE_WIDTH + SOBEL_OP_DIM - 1)
#define SharedDim_y (TILE_HEIGHT + SOBEL_OP_DIM - 1)
// Launch configuration (host code): one thread per output pixel, with the
// grid rounded up so that partial tiles at the image edges are still covered
dim3 dimBlock(TILE_WIDTH, TILE_HEIGHT);
dim3 dimGrid((test.w + TILE_WIDTH - 1) / TILE_WIDTH, (test.h + TILE_HEIGHT - 1) / TILE_HEIGHT);
I expected shared memory to be faster, but I can't figure out what causes conflicts when loading from global memory.
Any help would be appreciated. Thank you in advance.

How to get XY value from ct in Philips Hue?

How to get XY value from ct.
Ex: ct = 217, I want to get x="0.3127569", y= "0.32908".
I'm able to convert XY value into ct value using this below code.
// Convert a Hue xy pair (hue[0] = x, hue[1] = y) into a ct value.
float R1 = [hue[0] floatValue];
float S1 = [hue[1] floatValue];
// Slope term fed into the cubic below
float result = ((R1-0.332)/(S1-0.1858));
// Cubic in `result`; its reciprocal times 1e6 is used as ct below.
// Objective-C string literals start with '@' (the '#' in the pasted code does not compile).
NSString *ctString = [NSString stringWithFormat:@"%f", ((-449*result*result*result)+(3525*result*result)-(6823.3*result)+(5520.33))];
float micro2 = (float) (1 / [ctString floatValue] * 1000000);
NSString *ctValue = [NSString stringWithFormat:@"%f", micro2];
// Truncate to an integer ct
ctValue = [NSString stringWithFormat:@"%d", [ctValue intValue]];
// Clamp to the lower ct bound of 153
if ([ctValue integerValue] < 153) {
    ctValue = [NSString stringWithFormat:@"%d", 153];
}
Now I want reverse value, which is from ct to XY.
On Phillips HUE
2000 K maps to 500 and 6500 K maps to 153. The value is given in ct as a colour temperature, but it can be thought of as actually being in mired.
Mired means micro reciprocal degree (see Wikipedia).
The name ct is possibly used because it is not 100% mired. I am quite sure Philips uses a lookup table, as a lot of CIE algorithms do, because there are just 347 indexes in the range from 153 to 500.
The following is not a solution; it just shows the simple concept of a lookup table.
As the CIE 1931 xy-to-CCT formula by McCamy (found here) suggests, it is possible to use a lookup table to find x and y as well.
A table can be found here, but I am not sure that it is the right lookup table.
As a reminder: the following is not a solution, but the code may help in finding a reverse algorithm.
/* Colour temperature in Kelvin (whole degrees). */
typedef int Kelvin;
/* Reciprocal colour temperature in mired (1,000,000 / Kelvin). */
typedef float Mired;

/* Converts Kelvin to mired; 0 K maps to 0 instead of dividing by zero. */
Mired linearMiredByKelvin(Kelvin k) {
    return (k != 0) ? 1000000.0 / k : 0;
}
// Builds a Kelvin -> mired lookup table for 2000 K..6500 K and prints a sample
// row (ct, dimm, table value) to stderr every 250 K.
-(void)mired {
    Mired miredMin = 2000.0/13.0;    // 153.84 = reciprocal of 6500 K
    Mired miredMax = 500.0;          // 500.00 = reciprocal of 2000 K
    Mired lookupMiredByKelvin[6501]; // indexes 0..6500 Kelvin

    // Entries below 2000 K are unused; zero them so the table is fully defined
    for (Kelvin unused = 0; unused < 2000; unused++) {
        lookupMiredByKelvin[unused] = 0;
    }

    for (Kelvin k = 2000; k < 6501; k++) {
        Mired linearMired = linearMiredByKelvin(k);
        // Position of this temperature inside the mired range (0 at miredMin, 1 at miredMax)
        float dimm = (linearMired - miredMin) / ( miredMax - miredMin);
        // Map that position back to Kelvin (truncated), then to mired
        Kelvin ct = (Kelvin)(1000000.0/(dimm*miredMax - dimm*miredMin + miredMin));
        lookupMiredByKelvin[k] = linearMiredByKelvin(ct);

        // Print every 250 K step: 2000, 2250, ..., 6500
        if (k % 250 == 0) {
            fprintf(stderr,"%d %f %f\n",ct, dimm, lookupMiredByKelvin[k]);
        }
    }
}
At least this is proof that x and y will not sit on a simple vector.
CCT means correlated colour temperature and, as the implementation in the question shows, it can be calculated via n= (x-0.3320)/(0.1858-y); CCT = 437*n^3 + 3601*n^2 + 6861*n + 5517 (after McCamy).
But a cct of 217 is out of the range of the lookup table linked above.
Following the idea in this git repo from colour-science
and ported to C, it could look like this:
// Logs the xy chromaticity computed for a correlated colour temperature given
// in Kelvin. Per the commented-out range check carried over from the original
// port, inputs outside 4000..25000 K may give unpredictable results.
void CCT_to_xy_CIE_D(float cct) {
    //if (CCT < 4000 || CCT > 25000) fprintf(stderr, "Correlated colour temperature must be in domain, unpredictable results may occur! \n");
    float x = calculateXviaCCT(cct);
    float y = calculateYviaX(x);
    // Objective-C string literals start with '@' ('#' does not compile)
    NSLog(@"cct=%f x%f y%f",cct,x,y);
}
float calculateXviaCCT(float cct) {
float cct_3 = pow(cct, 3); //(cct*cct*cct);
float cct_2 = pow(cct, 2); //(cct*cct);
if (cct<=7000)
return -4.607 * pow(10, 9) / cct_3 + 2.9678 * pow(10, 6) / cct_2 + 0.09911 * pow(10, 3) / cct + 0.244063;
return -2.0064 * pow(10, 9) / cct_3 + 1.9018 * pow(10, 6) / cct_2 + 0.24748 * pow(10, 3) / cct + 0.23704;
}
float calculateYviaX(float x) {
return -3.000 * pow(x, 2) + 2.870 * x - 0.275;
}
CCT_to_xy_CIE_D(6504.38938305); // proof of concept with a CCT in Kelvin
// prints: cct=6504.389160 x0.312708 y0.329113
CCT_to_xy_CIE_D(217.0);
// prints: cct=217.000000 x-387.131073 y-450722.750000
// A Philips Hue ct value (153-500) passed directly as the CCT gives a
// meaningless result, so ct is not a usable input for this function.
// A CCT in Kelvin, however, works:
CCT_to_xy_CIE_D(2000.0);
// prints: cct=2000.000000 x0.459693 y0.410366
This seems to work fine for a CCT between 2000 and 25000. What may be confusing is that the CCT is given in Kelvin here, not as a Hue ct value.
EDIT
This has been through so many revisions and ideas. To keep it simple I edited most of that out and just give you the final result.
This fits your function perfectly except for a region in the middle (temp from 256 to 316) where it deviates a bit.
The problem with your function is that it has approximately infinite solutions, so to solve it nicely you need more constraints, but what? Ol Sen's reference https://www.waveformlighting.com/tech/calculate-color-temperature-cct-from-cie-1931-xy-coordinates discusses it in some detail and then mentions that you want a Duv to be zero. It also gives a way to calculate Duv and so I added that to my optimiser and voila!
Nice and smooth. The optimiser now solves for x and y that both satisfies your function and also minimises Duv.
To get it to work nicely I had to scale Duv quite a bit. That page mentions that Duv should be very small so I think this is a good thing. Also, as the temp increases the scaling should to help the optimiser.
Below prints from 153 to 500.
#import <Foundation/Foundation.h>
// Function taken from your code
// Simplified a bit
// Integer ct for an xy pair: evaluates the cubic from the question's
// xy-to-ct code, converts its reciprocal to ct, rounds to nearest and
// clamps to the lower ct bound of 153.
int ctFuncI ( float x, float y )
{
    float n = (x-0.332)/(y-0.1858);
    float cct = - 449 * n * n * n + 3525 * n * n - 6823.3 * n + 5520.33;
    float mired = 1 / cct * 1000000;
    int rounded = ( int )( mired + 0.5 );
    return rounded < 153 ? 153 : rounded;
}
// Need this
// Float version of your code
// Floating-point ct for an xy pair: same cubic as ctFuncI, but without
// rounding or clamping, so a solver can follow small changes.
float ctFuncF ( float x, float y )
{
    float n = (x-0.332)/(y-0.1858);
    float cct = - 449 * n * n * n + 3525 * n * n - 6823.3 * n + 5520.33;
    return 1000000 / cct;
}
// We need an additional constraint
// https://www.waveformlighting.com/tech/calculate-duv-from-cie-1931-xy-coordinates
// Given x, y calculate Duv
// We want this to be 0
float duv ( float x, float y )
{
float f = 1 / ( - 2 * x + 12 * y + 3 );
float u = 4 * x * f;
float v = 6 * y * f;
// I'm typing float but my heart yells double
float k6 = -0.00616793;
float k5 = 0.0893944;
float k4 = -0.5179722;
float k3 = 1.5317403;
float k2 = -2.4243787;
float k1 = 1.925865;
float k0 = -0.471106;
float du = u - 0.292;
float dv = v - 0.24;
float Lfp = sqrt ( du * du + dv * dv );
float a = acos( du / Lfp );
float Lbb = k6 * pow ( a, 6 ) + k5 * pow( a, 5 ) + k4 * pow( a, 4 ) + k3 * pow( a, 3 ) + k2 * pow(a,2) + k1 * a + k0;
return Lfp - Lbb;
}
// Solver!
// Returns iterations
// Cost minimised by ctSolve: distance of the ct at (x, y) from the target
// plus the scaled magnitude of Duv at that point.
static double ctSolveCost ( int ct, float s, float x, float y )
{
    return fabs( ctFuncF ( x, y ) - ct ) + s * fabs( duv ( x, y ) );
}

// Solver: adjusts *x and *y in place, starting from the caller's guess, until
// the cost drops to 0.5 or below, or 250 iterations have run.
// Returns the number of iterations used.
int ctSolve ( int ct, float * x, float * y )
{
    int iter = 0;
    float stepX = 0.001;
    float stepY = 0.001;

    // Duv is scaled so that it carries weight next to the ct error; the
    // scale is ten times larger above ct 255.
    float s = 1000 * ( ct > 255 ? 10 : 1 );
    float cost = ctSolveCost ( ct, s, * x, * y );

    while ( cost > 0.5 && iter < 250 )
    {
        iter ++;

        // Per axis: grow the step by 1.2 if a trial move lowers the cost,
        // otherwise halve it and reverse direction. Both trials start from
        // the same point; the moves are applied only afterwards.
        stepX *= ctSolveCost ( ct, s, * x + stepX, * y ) < cost ? 1.2 : - 0.5;
        stepY *= ctSolveCost ( ct, s, * x, * y + stepY ) < cost ? 1.2 : - 0.5;

        * x += stepX;
        * y += stepY;

        cost = ctSolveCost ( ct, s, * x, * y );
    }

    return iter;
}
// Tester
// Tester: solves for x and y at every ct from 153 to 500 and logs the
// result, the ct recomputed from it, the iteration count and the error.
int main(int argc, const char * argv[]) {
    // Objective-C directives and string literals start with '@'; the '#'
    // characters in the pasted code do not compile.
    @autoreleasepool
    {
        NSLog(@"Hello, World!");

        float x, y;
        int sume = 0; // sum of absolute ct errors
        int sumi = 0; // total solver iterations

        for ( int ct = 153; ct <= 500; ct ++ )
        {
            // Initial guess
            x = 0.4;
            y = 0.4;

            // Approx
            int iter = ctSolve ( ct, & x, & y );

            // CT and error
            int ctEst = ctFuncI ( x, y );
            int e = ct - ctEst;

            // Diagnostics
            sume += abs ( e );
            sumi += iter;

            // Print out results
            NSLog ( @"want ct = %d x = %f y = %f got ct %d in %d iter error %d", ct, x, y, ctEst, iter, e );
        }

        NSLog ( @"Sum of abs errors %d iterations %d", sume, sumi );
    }
    return 0;
}
To use it, do as below.
// Initialise x and y with a starting guess; the solver refines them in place
float x = 0.4;
float y = 0.4;
// Call the solver with the target ct
int ct = 217;
ctSolve( ct, & x, & y ); // x and y are passed by address and are overwritten
// x and y now hold the result; the return value (iteration count) is ignored here
Here is a slightly more compact answer, with functions to convert back and forth.
Beware that there are rounding issues, because McCamy's formula relies on mathematical assumptions, and so does the backward calculation.
If you want to find more results, search directly for "n= (x-0.3320)/(0.1858-y); CCT = 437*n^3 + 3601*n^2 + 6861*n + 5517"; there are plenty of different methods to convert back and forth.
So here are the conversions Philips Hue @[@x,@y] to Philips ct, Philips ct to CCT, and CCT to x,y:
// Prints the xy chromaticity for a CCT in Kelvin to stderr.
// y is derived from the computed x, not from the CCT directly.
void CCT_to_xy_CIE_D(float cct) {
    const float x = calculateXviaCCT(cct);
    const float y = calculateYviaX(x);
    fprintf(stderr,"cct=%f x%f y%f",cct,x,y);
}
float calculateXviaCCT(float cct) {
float cct_3 = pow(cct, 3); //(cct*cct*cct);
float cct_2 = pow(cct, 2); //(cct*cct);
if (cct<=7000.0)
return -4.607 * pow(10, 9) / cct_3 + 2.9678 * pow(10, 6) / cct_2 + 0.09911 * pow(10, 3) / cct + 0.244063;
return -2.0064 * pow(10, 9) / cct_3 + 1.9018 * pow(10, 6) / cct_2 + 0.24748 * pow(10, 3) / cct + 0.23704;
}
// y chromaticity for a given x: y = -3.000 x^2 + 2.870 x - 0.275.
float calculateYviaX(float x) {
    const double quadratic = -3.000 * x*x;
    const double linear = 2.870 * x;
    return quadratic + linear - 0.275;
}
// Converts a CCT in Kelvin to an integer ct value (1,000,000 / cct,
// truncated). Inputs above 6500 K give 153 and inputs below 2000 K give 500.
int calculate_PhillipsHueCT_withCCT(float cct) {
    int ct;
    if (cct>6500.0) {
        ct = 2000.0/13.0;   // 153.84..., truncated to 153
    } else if (cct<2000.0) {
        ct = 500.0;
    } else {
        ct = 1000000 / cct;
    }
    return ct;
}
// Converts a ct value to a CCT in Kelvin (1,000,000 / ct).
// A ct of 0 returns 0 instead of dividing by zero.
float calculate_CCT_withPhillipsHueCT(float ct) {
    if (ct != 0.0) {
        return 1000000 / ct;
    }
    return 0.0;
}
// Computes the CCT for a Hue xy pair (hue[0] = x, hue[1] = y) with the cubic
// n = (x-0.3320)/(0.1858-y); cct = 437 n^3 + 3601 n^2 + 6861 n + 5517.
float calculate_CCT_withHueXY(NSArray *hue) {
    float x = [[hue objectAtIndex:0] floatValue];
    float y = [[hue objectAtIndex:1] floatValue];
    float n = (x-0.3320)/(0.1858-y);
    return 437.0*n*n*n + 3601.0*n*n + 6861.0*n + 5517.0;
}
// MC Camy formula n=(x-0.3320)/(0.1858-y); cct = 437*n^3 + 3601*n^2 + 6861*n + 5517;
// Round-trip demo: xy -> CCT -> ct, then CCT -> xy for several sample values.
-(void)testPhillipsHueCt_backAndForth {
    // Objective-C array, number and string literals start with '@'; the '#'
    // characters in the pasted code do not compile.
    NSArray *hue = @[@(0.312708),@(0.329113)];
    float cct = calculate_CCT_withHueXY(hue);
    // The converter returns an int; it is widened to float for the %f format
    float ct = calculate_PhillipsHueCT_withCCT(cct);
    NSLog(@"ct %f",ct);
    CCT_to_xy_CIE_D(cct); // check
    CCT_to_xy_CIE_D(6504.38938305); //proof of concept
    CCT_to_xy_CIE_D(2000.0);
    CCT_to_xy_CIE_D(calculate_CCT_withPhillipsHueCT(217.0));
}

Separable gaussian blur - optimize vertical pass

I have implemented separable Gaussian blur. Horizontal pass was relatively easy to optimize with SIMD processing. However, I am not sure how to optimize vertical pass.
Accessing elements is not very cache friendly, and filling a SIMD lane would mean reading many different pixels. I was thinking about transposing the image, running the horizontal pass and then transposing the image back; however, I am not sure it would bring any improvement because of the two transpose operations.
I have quite large images (16k resolution) and the kernel size is 19, so the gain from vectorizing the vertical pass was only about 15%.
My vertical pass is as follows (it is inside a generic class templated on T, which can be uint8_t or float):
int yStart = kernelHalfSize;
int xStart = kernelHalfSize;
int yEnd = input.GetWidth() - kernelHalfSize;
int xEnd = input.GetHeigh() - kernelHalfSize;
const T * inData = input.GetData().data();
V * outData = output.GetData().data();
int kn = kernelHalfSize * 2 + 1;
int kn4 = kn - kn % 4;
for (int y = yStart; y < yEnd; y++)
{
size_t yW = size_t(y) * output.GetWidth();
size_t outX = size_t(xStart) + yW;
size_t xEndSimd = xStart;
int len = xEnd - xStart;
len = len - len % 4;
xEndSimd = xStart + len;
for (int x = xStart; x < xEndSimd; x += 4)
{
size_t inYW = size_t(y) * input.GetWidth();
size_t x0 = ((x + 0) - kernelHalfSize) + inYW;
size_t x1 = x0 + 1;
size_t x2 = x0 + 2;
size_t x3 = x0 + 3;
__m128 sumDot = _mm_setzero_ps();
int i = 0;
for (; i < kn4; i += 4)
{
__m128 kx = _mm_set_ps1(kernelDataX[i + 0]);
__m128 ky = _mm_set_ps1(kernelDataX[i + 1]);
__m128 kz = _mm_set_ps1(kernelDataX[i + 2]);
__m128 kw = _mm_set_ps1(kernelDataX[i + 3]);
__m128 dx, dy, dz, dw;
if constexpr (std::is_same<T, uint8_t>::value)
{
//we need co convert uint8_t inputs to float
__m128i u8_0 = _mm_loadu_si128((const __m128i*)(inData + x0));
__m128i u8_1 = _mm_loadu_si128((const __m128i*)(inData + x1));
__m128i u8_2 = _mm_loadu_si128((const __m128i*)(inData + x2));
__m128i u8_3 = _mm_loadu_si128((const __m128i*)(inData + x3));
__m128i u32_0 = _mm_unpacklo_epi16(
_mm_unpacklo_epi8(u8_0, _mm_setzero_si128()),
_mm_setzero_si128());
__m128i u32_1 = _mm_unpacklo_epi16(
_mm_unpacklo_epi8(u8_1, _mm_setzero_si128()),
_mm_setzero_si128());
__m128i u32_2 = _mm_unpacklo_epi16(
_mm_unpacklo_epi8(u8_2, _mm_setzero_si128()),
_mm_setzero_si128());
__m128i u32_3 = _mm_unpacklo_epi16(
_mm_unpacklo_epi8(u8_3, _mm_setzero_si128()),
_mm_setzero_si128());
dx = _mm_cvtepi32_ps(u32_0);
dy = _mm_cvtepi32_ps(u32_1);
dz = _mm_cvtepi32_ps(u32_2);
dw = _mm_cvtepi32_ps(u32_3);
}
else
{
/*
//load 8 consecutive values
auto dd = _mm256_loadu_ps(inData + x0);
//extract parts by shifting and casting to 4 values float
dx = _mm256_castps256_ps128(dd);
dy = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 4, 3, 2, 1)));
dz = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 5, 4, 3, 2)));
dw = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 6, 5, 4, 3)));
*/
dx = _mm_loadu_ps(inData + x0);
dy = _mm_loadu_ps(inData + x1);
dz = _mm_loadu_ps(inData + x2);
dw = _mm_loadu_ps(inData + x3);
}
//calculate 4 dots at once
//[dx, dy, dz, dw] <dot> [kx, ky, kz, kw]
auto mx = _mm_mul_ps(dx, kx); //dx * kx
auto my = _mm_fmadd_ps(dy, ky, mx); //mx + dy * ky
auto mz = _mm_fmadd_ps(dz, kz, my); //my + dz * kz
auto res = _mm_fmadd_ps(dw, kw, mz); //mz + dw * kw
sumDot = _mm_add_ps(sumDot, res);
x0 += 4;
x1 += 4;
x2 += 4;
x3 += 4;
}
for (; i < kn; i++)
{
auto v = _mm_set_ps1(kernelDataX[i]);
auto v2 = _mm_set_ps(
*(inData + x3), *(inData + x2),
*(inData + x1), *(inData + x0)
);
sumDot = _mm_add_ps(sumDot, _mm_mul_ps(v, v2));
x0++;
x1++;
x2++;
x3++;
}
sumDot = _mm_mul_ps(sumDot, _mm_set_ps1(weightX));
if constexpr (std::is_same<V, uint8_t>::value)
{
__m128i asInt = _mm_cvtps_epi32(sumDot);
asInt = _mm_packus_epi32(asInt, asInt);
asInt = _mm_packus_epi16(asInt, asInt);
uint32_t res = _mm_cvtsi128_si32(asInt);
((uint32_t *)(outData + outX))[0] = res;
outX += 4;
}
else
{
float tmpRes[4];
_mm_store_ps(tmpRes, sumDot);
outData[outX + 0] = tmpRes[0];
outData[outX + 1] = tmpRes[1];
outData[outX + 2] = tmpRes[2];
outData[outX + 3] = tmpRes[3];
outX += 4;
}
}
for (int x = xEndSimd; x < xEnd; x++)
{
int kn = kernelHalfSize * 2 + 1;
const T * v = input.GetPixelStart(x - kernelHalfSize, y);
float tmp = 0;
for (int i = 0; i < kn; i++)
{
tmp += kernelDataX[i] * v[i];
}
tmp *= weightX;
outData[outX] = ImageUtils::clamp_cast<V>(tmp);
outX++;
}
}
There’s a well-known trick for that.
While you compute both passes, read them sequentially, use SIMD to compute, but write out the result into another buffer, transposed, using scalar stores. Protip: SSE 4.1 has _mm_extract_ps just don’t forget to cast your destination image pointer from float* into int*. Another thing about these stores, I would recommend using _mm_stream_si32 for that as you want maximum cache space used by your input data. When you’ll be computing the second pass, you’ll be reading sequential memory addresses again, the prefetcher hardware will deal with the latency.
This way both passes will be identical, I usually call same function twice, with different buffers.
Two transposes caused by your 2 passes cancel each other. Here’s an HLSL version, BTW.
There’s more. If your kernel size is only 19, that fits in 3 AVX registers. I think shuffle/permute/blend instructions are still faster than even L1 cache loads, i.e. it might be better to load the kernel outside the loop.

What is slices in OpenGL?

In the code below, why do we need slices, and what are they for?
//https://github.com/danginsburg/opengles-book-samples/blob/604a02cc84f9cc4369f7efe93d2a1d7f2cab2ba7/iPhone/Common/esUtil.h#L110
/*
 * Generates the geometry of a sphere as an indexed triangle list.
 *
 * numSlices        number of subdivisions around the sphere; numSlices / 2
 *                  bands ("parallels") are generated between the poles
 * radius           sphere radius
 * vertices         if non-NULL, receives a malloc'ed array of 3 floats per vertex
 * texCoords        if non-NULL, receives a malloc'ed array of 2 floats per vertex
 * indices          if non-NULL, receives a malloc'ed array of uint16_t indices
 * numVertices_out  if non-NULL, receives the vertex count
 *
 * Returns the number of indices. The caller owns the returned arrays.
 */
int esGenSphere(int numSlices, float radius, float **vertices,
                float **texCoords, uint16_t **indices, int *numVertices_out) {
    const int numParallels = numSlices / 2;
    const int rowLen = numSlices + 1; /* vertices per parallel; the seam is duplicated */
    const int numVertices = (numParallels + 1) * rowLen;
    const int numIndices = numParallels * numSlices * 6;
    const float angleStep = (2.0f * ES_PI) / ((float) numSlices);

    if (vertices != NULL) {
        *vertices = malloc(sizeof(float) * 3 * numVertices);
    }
    if (texCoords != NULL) {
        *texCoords = malloc(sizeof(float) * 2 * numVertices);
    }
    if (indices != NULL) {
        *indices = malloc(sizeof(uint16_t) * numIndices);
    }

    /* Vertex grid: i selects the parallel, j the position around it */
    for (int i = 0; i <= numParallels; i++) {
        const float sinI = sinf(angleStep * (float)i);
        const float cosI = cosf(angleStep * (float)i);
        for (int j = 0; j <= numSlices; j++) {
            const int v = i * rowLen + j;
            if (vertices) {
                float *pos = *vertices + v * 3;
                pos[0] = radius * sinI * sinf(angleStep * (float)j);
                pos[1] = radius * cosI;
                pos[2] = radius * sinI * cosf(angleStep * (float)j);
            }
            if (texCoords) {
                float *uv = *texCoords + v * 2;
                uv[0] = (float)j / (float)numSlices;
                uv[1] = 1.0f - ((float)i / (float)numParallels);
            }
        }
    }

    /* Two triangles for every quad between neighbouring parallels */
    if (indices != NULL) {
        uint16_t *out = *indices;
        for (int i = 0; i < numParallels; i++) {
            for (int j = 0; j < numSlices; j++) {
                const int upper = i * rowLen + j;       /* quad corner on parallel i */
                const int lower = (i + 1) * rowLen + j; /* same column on parallel i + 1 */
                out[0] = upper;
                out[1] = lower;
                out[2] = lower + 1;
                out[3] = upper;
                out[4] = lower + 1;
                out[5] = upper + 1;
                out += 6;
            }
        }
    }

    if (numVertices_out) {
        *numVertices_out = numVertices;
    }
    return numIndices;
}
That code generates a sphere mesh that looks like this:
Source: https://commons.wikimedia.org/wiki/File:Sphere_wireframe_10deg_6r.svg CC BY 3.0
As you can see in the picture, there are horizontal parallel lines, and vertical lines which all meet at the poles. The horizontal lines are typically called parallels whereas the vertical ones are called meridians. The author of that code apparently didn't know this term, so they called it "slices" instead.

Multi otsu(multi-thresholding) with openCV

I am trying to carry out multi-thresholding with Otsu's method. The approach I am currently using maximises the between-class variance, and I have managed to get the same threshold value as the one given by the OpenCV library. However, that is from running Otsu's method just once.
Documentation on how to do multi-level thresholding, or rather recursive thresholding, is rather limited. What do I do after obtaining the original Otsu value? I would appreciate some hints. I have been playing around with the code, adding one outer for loop, but the next value calculated is always 254 for any given image :(
My code if need be:
// Single-level Otsu threshold computed by hand from the grayscale histogram,
// printed next to the value OpenCV's own Otsu implementation returns.
//compute histogram first
cv::Mat imageh; //image edited to grayscale for histogram purpose
//imageh=image; //to delete and uncomment below;
cv::cvtColor(image, imageh, CV_BGR2GRAY);
int histSize[1] = {256}; // number of bins
float hranges[2] = {0.0, 256.0}; // min and max pixel value
const float* ranges[1] = {hranges};
int channels[1] = {0}; // only 1 channel used
cv::MatND hist;
// Compute histogram
calcHist(&imageh, 1, channels, cv::Mat(), hist, 1, histSize, ranges);
// Reference value: let OpenCV compute the Otsu threshold through the legacy C API.
// NOTE(review): `im` is allocated with `new` and `finalIm` with cvCreateImage; no
// matching delete / cvReleaseImage appears in this snippet - confirm they are released elsewhere.
IplImage* im = new IplImage(imageh);//assign the image to an IplImage pointer
IplImage* finalIm = cvCreateImage(cvSize(im->width, im->height), IPL_DEPTH_8U, 1);
double otsuThreshold= cvThreshold(im, finalIm, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU );
cout<<"opencv otsu gives "<<otsuThreshold<<endl;
int totalNumberOfPixels= imageh.total();
cout<<"total number of Pixels is " <<totalNumberOfPixels<< endl;
// sum = sum over all bins of (intensity * count); used below to derive the foreground mean.
float sum = 0;
for (int t=0 ; t<256 ; t++)
{
sum += t * hist.at<float>(t);
}
cout<<"sum is "<<sum<<endl;
float sumB = 0; //sum of background
// NOTE(review): wB is an int that accumulates float histogram entries (implicit conversion).
int wB = 0; // weight of background
int wF = 0; //weight of foreground
float varMax = 0;
int threshold = 0;
//run an iteration to find the maximum value of the between class variance (the between class variance should be maximised)
for (int t=0 ; t<256 ; t++)
{
wB += hist.at<float>(t); // Weight Background
// Skip candidate thresholds below the first populated bin (empty background class).
if (wB == 0) continue;
wF = totalNumberOfPixels - wB; // Weight Foreground
// Once every pixel is in the background no further split is possible.
if (wF == 0) break;
sumB += (float) (t * hist.at<float>(t));
float mB = sumB / wB; // Mean Background
float mF = (sum - sumB) / wF; // Mean Foreground
// Calculate Between Class Variance
float varBetween = (float)wB * (float)wF * (mB - mF) * (mB - mF);
// Check if new maximum found
if (varBetween > varMax) {
varMax = varBetween;
threshold = t;
}
}
cout<<"threshold value is: "<<threshold;
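To extend Otsu's thresholding method to multi-level thresholding, the between-class variance equation becomes $\sigma_B^2 = \sum_{k=0}^{M-1} \omega_k \, (\mu_k - \mu_T)^2$, where $M$ is the number of classes, $\omega_k$ and $\mu_k$ are the weight (probability) and mean intensity of class $k$, and $\mu_T$ is the mean intensity of the whole image.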
Please check out Deng-Yuan Huang, Ta-Wei Lin, Wu-Chih Hu, Automatic
Multilevel Thresholding Based on Two-Stage Otsu's Method with Cluster
Determination by Valley Estimation, Int. Journal of Innovative
Computing, 2011, 7:5631-5644 for more information.
http://www.ijicic.org/ijicic-10-05033.pdf
Here is my C# implementation of Otsu Multi for 2 thresholds:
/* Otsu (1979) - multi */
/// <summary>
/// Finds two Otsu thresholds by exhaustively searching every pair (t1, t2) with
/// t1 &lt; t2 and keeping the pair that maximises the between-class variance of
/// the three resulting classes [0..t1], (t1..t2] and (t2..255].
/// Pixels with intensity 0 are excluded from the histogram.
/// </summary>
/// <param name="sender">Unused (event-handler style signature).</param>
/// <param name="e">Unused (event-handler style signature).</param>
/// <returns>
/// The pair (t1, t2) of optimal thresholds, or (0, 0) when no valid
/// three-class split exists (for example when the image has no non-zero pixels).
/// </returns>
Tuple<int, int> otsuMulti(object sender, EventArgs e) {
    // Image histogram and number of counted (non-zero) pixels.
    int[] histogram = new int[256];
    int N = 0;
    foreach (int intensity in image.Data) {
        if (intensity != 0) {
            histogram[intensity] += 1;
            N++;
        }
    }

    // Thresholds are bin indices, so they are kept as ints to match the return type.
    int optimalThresh1 = 0;
    int optimalThresh2 = 0;

    // Nothing to threshold; also avoids dividing by zero below.
    if (N == 0) {
        return Tuple.Create(optimalThresh1, optimalThresh2);
    }

    // MT: mean intensity of the whole (counted) image.
    double MT = 0;
    for (int k = 0; k <= 255; k++) {
        MT += k * (histogram[k] / (double) N);
    }

    double maxBetweenVar = 0;
    double W0K = 0; // cumulative probability of class 0
    double M0K = 0; // cumulative first moment of class 0

    for (int t1 = 0; t1 <= 255; t1++) {
        W0K += histogram[t1] / (double) N;        // Pi
        M0K += t1 * (histogram[t1] / (double) N); // i * Pi

        // An empty class 0 has no defined mean; such a split can never win.
        if (W0K <= 0) continue;
        double M0 = M0K / W0K;

        double W1K = 0; // cumulative probability of class 1
        double M1K = 0; // cumulative first moment of class 1

        for (int t2 = t1 + 1; t2 <= 255; t2++) {
            W1K += histogram[t2] / (double) N;        // Pi
            M1K += t2 * (histogram[t2] / (double) N); // i * Pi

            // Class 2 is whatever remains after classes 0 and 1.
            double W2K = 1 - (W0K + W1K);
            if (W2K <= 0) break; // nothing left for class 2; larger t2 cannot help

            // An empty class 1 has no defined mean; skip this split.
            if (W1K <= 0) continue;

            double M1 = M1K / W1K;
            double M2K = MT - (M0K + M1K);
            double M2 = M2K / W2K;

            double currVarB = W0K * (M0 - MT) * (M0 - MT)
                            + W1K * (M1 - MT) * (M1 - MT)
                            + W2K * (M2 - MT) * (M2 - MT);

            if (maxBetweenVar < currVarB) {
                maxBetweenVar = currVarB;
                optimalThresh1 = t1;
                optimalThresh2 = t2;
            }
        }
    }

    // System.Tuple is a static class and cannot be instantiated with `new Tuple(...)`;
    // Tuple.Create infers Tuple<int, int> from the arguments.
    return Tuple.Create(optimalThresh1, optimalThresh2);
}
And this is the result I got by thresholding an image scan of soil with the above code:
(T1 = 110, T2 = 147).
Otsu's original paper: "Nobuyuki Otsu, A Threshold Selection Method
from Gray-Level Histogram, IEEE Transactions on Systems, Man, and
Cybernetics, 1979, 9:62-66" also briefly mentions the extension to
Multithresholding.
https://engineering.purdue.edu/kak/computervision/ECE661.08/OTSU_paper.pdf
Hope this helps.
Here is a simple general approach for 'n' thresholds in python (>3.0) :
# developed by- SUJOY KUMAR GOSWAMI
# source paper- https://people.ece.cornell.edu/acharya/papers/mlt_thr_img.pdf
import cv2
import numpy as np
import math
# Load the input image and convert it to a single-channel grayscale image.
img = cv2.imread('path-to-image')
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# [a, b] is the intensity range currently being examined; it starts as the full 8-bit range.
a = 0
b = 255
n = 6 # number of thresholds (better choose even value)
k = 0.7 # free variable to take any positive value
T = [] # list which will contain 'n' thresholds
def sujoy(img, a, b):
    """Return ``(mean, count)`` of the pixels of ``img`` whose value lies in ``[a, b]``.

    ``img`` is converted with ``np.array``. When ``a > b`` the range is empty
    by construction and the sentinel pair ``(-1, -1)`` is returned instead.
    If ``a <= b`` but no pixel falls inside the range, the mean is computed as
    a division by a zero count (not guarded here).
    """
    if a > b:
        return -1, -1
    pixels = np.array(img)
    # Boolean mask of the pixels inside the closed interval [a, b].
    in_range = (pixels >= a) & (pixels <= b)
    count = np.sum(in_range)
    # Multiplying by the mask zeroes every pixel outside the interval.
    total = np.sum(pixels * in_range)
    return total / count, count
# Each pass takes the mean/std-dev of the pixels inside [a, b], emits one
# threshold for each tail ([a, T1] and [T2, b]) and then narrows [a, b].
for step in range(int(n/2-1)):
    img = np.array(img)
    inside = np.multiply(img >= a, img <= b)   # mask of pixels inside [a, b]
    kept = np.multiply(img, inside)            # pixel values, zero outside the mask
    mu = np.sum(kept)/np.sum(inside)
    # Deviation from the mean, masked again so outside pixels contribute nothing.
    dev = np.multiply(kept - mu, inside)
    sigma = math.sqrt(np.sum(np.multiply(dev, dev))/np.sum(inside))
    T1 = mu - k*sigma
    T2 = mu + k*sigma
    low_mean, low_count = sujoy(img, a, T1)
    high_mean, high_count = sujoy(img, T2, b)
    T.extend([low_mean, high_mean])
    a = T1+1
    b = T2-1
    k = k*(step+1)

# Final step: split the remaining range at the last computed mean.
# `mu` is only assigned inside the loop above, so at least one pass must have run.
T1 = mu
T2 = mu+1
low_mean, low_count = sujoy(img, a, T1)
high_mean, high_count = sujoy(img, T2, b)
T.extend([low_mean, high_mean])
T.sort()
print(T)
For full paper and more informations visit this link.
I've written an example of how Otsu thresholding works in Python before. You can see the source code here: https://github.com/subokita/Sandbox/blob/master/otsu.py
In the example there's 2 variants, otsu2() which is the optimised version, as seen on Wikipedia page, and otsu() which is more naive implementation based on the algorithm description itself.
If you are okay in reading python codes (in this case, they are pretty simple, almost pseudo code like), you might want to look at otsu() in the example and modify it. Porting it to C++ code is not hard either.
@Antoni4 gives the best answer in my opinion, and it's very straightforward to increase the number of levels.
This is for three-level thresholding:
#include "Shadow01-1.cuh"
/**
 * Three-threshold Otsu search.
 *
 * Exhaustively tries every triple t1 < t2 < t3 and writes the triple that
 * maximises the between-class variance of the four classes
 * [0..t1], (t1..t2], (t2..t3], (t3..255] to the output references.
 *
 * @param optimalThresh1  [out] first threshold  (left untouched if no valid split is found)
 * @param optimalThresh2  [out] second threshold (left untouched if no valid split is found)
 * @param optimalThresh3  [out] third threshold  (left untouched if no valid split is found)
 * @param imgHist         histogram of the image; its raw data is read as 256 entries
 * @param src             source image; only rows*cols is used, as the pixel count
 */
void multiThresh(double &optimalThresh1, double &optimalThresh2, double &optimalThresh3, cv::Mat &imgHist, cv::Mat &src)
{
    double W0K, W1K, W2K, W3K, M0, M1, M2, M3, currVarB, maxBetweenVar, M0K, M1K, M2K, M3K, MT;
    // NOTE(review): the histogram buffer is read as unsigned char; confirm that
    // imgHist really is an 8-bit matrix (cv::calcHist normally yields 32-bit floats).
    unsigned char *histogram = (unsigned char*)(imgHist.data);
    int N = src.rows*src.cols;

    W0K = 0;
    M0K = 0;
    MT = 0;
    maxBetweenVar = 0;

    // MT: mean intensity of the whole image.
    for (int k = 0; k <= 255; k++) {
        MT += k * (histogram[k] / (double) N);
    }

    for (int t1 = 0; t1 <= 255; t1++)
    {
        // Class 0 grows by one bin per t1 step.
        W0K += histogram[t1] / (double) N;          //Pi
        M0K += t1 * (histogram[t1] / (double) N);   //i * Pi
        M0 = M0K / W0K;                             //(i * Pi)/Pi

        // Class 1 restarts for every t1.
        W1K = 0;
        M1K = 0;
        for (int t2 = t1 + 1; t2 <= 255; t2++)
        {
            W1K += histogram[t2] / (double) N;          //Pi
            M1K += t2 * (histogram[t2] / (double) N);   //i * Pi
            M1 = M1K / W1K;                             //(i * Pi)/Pi

            // Nothing left for classes 2 and 3: larger t2 cannot help either.
            if (1 - (W0K + W1K) <= 0) break;

            // Class 2 restarts for every t2 (the original reset W3K/M3K here and
            // accumulated class 2 on top of the remainder, which was wrong).
            W2K = 0;
            M2K = 0;
            for (int t3 = t2 + 1; t3 <= 255; t3++)
            {
                W2K += histogram[t3] / (double) N;          //Pi
                M2K += t3 * (histogram[t3] / (double) N);   // i*Pi
                M2 = M2K / W2K;                             //(i*Pi)/Pi

                // Class 3 is the remainder after ALL previous classes.
                W3K = 1 - (W0K + W1K + W2K);
                M3K = MT - (M0K + M1K + M2K);
                if (W3K <= 0) break; // class 3 is empty from here on
                M3 = M3K / W3K;

                // Empty classes yield NaN means, so currVarB is NaN and the
                // comparison below is false: such splits are never selected.
                currVarB = W0K * (M0 - MT) * (M0 - MT)
                         + W1K * (M1 - MT) * (M1 - MT)
                         + W2K * (M2 - MT) * (M2 - MT)
                         + W3K * (M3 - MT) * (M3 - MT);
                if (maxBetweenVar < currVarB)
                {
                    maxBetweenVar = currVarB;
                    optimalThresh1 = t1;
                    optimalThresh2 = t2;
                    optimalThresh3 = t3;
                }
            }
        }
    }
}
@Guilherme Silva
Your code has a BUG
You Must Replace:
W3K = 0;
M3K = 0;
with
W2K = 0;
M2K = 0;
and
W3K = 1 - (W1K + W2K);
M3K = MT - (M1K + M2K);
with
W3K = 1 - (W0K + W1K + W2K);
M3K = MT - (M0K + M1K + M2K);
;-)
Regards
EDIT(1): [Toby Speight]
I discovered this bug by applying the effect to the same picture at different resolutions (sizes) and seeing that the output results were too different from each other (even when changing the resolution only a little).
W3K and M3K must be the totals minus the Previous WKs and MKs.
(I thought about this for Code-similarity with the one with one level less)
At the moment due to my lacks of English I cannot explain Better How and Why
To be honest, I'm still not 100% sure that this way is correct, even though from my outputs I could tell that it gives better results (even with one more level, i.e. 5 shades of gray).
You could try yourself ;-)
Sorry
My Outputs:
3 Thresholds
4 Thresholds
I found a useful piece of code in this thread. I was looking for a multi-level Otsu implementation for double/float images. So, I tried to generalize example for N-levels with double/float matrix as input. In my code below I am using armadillo library as dependency. But this code can be easily adapted for standard C++ arrays, just replace vec, uvec objects with single dimensional double and integer arrays, mat and umat with two-dimensional. Two other functions from armadillo used here are: vectorise and hist.
// Input parameters:
// map - input image (double matrix)
// mask - region of interest to be thresholded
// nBins - number of bins
// nLevels - number of Otsu thresholds
#include <algorithm>
#include <climits>
#include <vector>

#include <armadillo>
// Defined further down in the file; declared here because it is called below.
int nCombinations(int n, int r);

/**
 * Multi-level Otsu thresholding of a double-valued matrix.
 *
 * Builds an nBins-bin histogram of `map`, enumerates every combination of
 * nLevels bin indices, keeps the combination with the largest between-class
 * variance, prints the resulting thresholds to cout and returns a label image.
 *
 * @param map      input image
 * @param nBins    number of histogram bins
 * @param nLevels  number of thresholds to find
 * @return matrix of the same size as `map`; each element is ll+1 for the last
 *         threshold index ll whose value the input element reaches, or 0 if it
 *         reaches none. An all-zero matrix is returned when nLevels <= 1.
 */
mat OtsuFilterMulti(mat map, int nBins, int nLevels) {
    mat mapr; // output thresholded image
    mapr = zeros<mat>(map.n_rows, map.n_cols);
    unsigned int numElem = 0;
    vec threshold = zeros<vec>(nLevels);
    vec q = zeros<vec>(nLevels + 1);    // class probabilities
    vec mu = zeros<vec>(nLevels + 1);   // class means (in bin-index units)
    vec muk = zeros<vec>(nLevels + 1);  // class first moments (in bin-index units)
    uvec binv = zeros<uvec>(nLevels);
    if (nLevels <= 1) return mapr;
    numElem = map.n_rows*map.n_cols;
    uvec histogram = hist(vectorise(map), nBins);
    double maxval = map.max();
    double minval = map.min();
    // Bin width over the actual data range. The original used
    // (maxval - abs(minval)), which is only right when minval == 0.
    double odelta = (maxval - minval) / nBins; // distance between histogram bins
    vec oval = zeros<vec>(nBins);
    double mt = 0, variance = 0.0, bestVariance = 0.0;
    for (int ii = 0; ii < nBins; ii++) {
        // Bin centres start at minval, not at 0.
        oval(ii) = minval + (double)odelta*ii + (double)odelta*0.5; // centers of histogram bins
        mt += (double)ii*((double)histogram(ii)) / (double)numElem;
    }
    for (int ii = 0; ii < nLevels; ii++) {
        binv(ii) = ii;
    }
    double sq, smuk;
    int nComb;
    nComb = nCombinations(nBins,nLevels);
    // Boolean selector with nLevels leading `true` entries; stepping it with
    // prev_permutation visits every choice of nLevels bins out of nBins.
    std::vector<bool> v(nBins);
    std::fill(v.begin(), v.begin() + nLevels, true);
    umat ibin = zeros<umat>(nComb, nLevels); // indices from combinations will be stored here
    int cc = 0;
    int ci = 0;
    do {
        for (int i = 0; i < nBins; ++i) {
            if(ci==nLevels) ci=0;
            if (v[i]) {
                ibin(cc,ci) = i;
                ci++;
            }
        }
        cc++;
    } while (std::prev_permutation(v.begin(), v.end()));
    uvec lastIndex = zeros<uvec>(nLevels);
    // Perform operations on pre-calculated indices
    // NOTE(review): q/muk are carried over between successive combinations and
    // only updated when the index at position jj changes; the correctness of this
    // incremental scheme has not been verified here.
    for (int ii = 0; ii < nComb; ii++) {
        for (int jj = 0; jj < nLevels; jj++) {
            smuk = 0;
            sq = 0;
            if (lastIndex(jj) != ibin(ii, jj) || ii == 0) {
                q(jj) += double(histogram(ibin(ii, jj))) / (double)numElem;
                muk(jj) += ibin(ii, jj)*(double(histogram(ibin(ii, jj)))) / (double)numElem;
                mu(jj) = muk(jj) / q(jj);
                q(jj + 1) = 0.0;
                muk(jj + 1) = 0.0;
                if (jj>0) {
                    // The next class receives whatever the classes 0..jj leave over.
                    for (int kk = 0; kk <= jj; kk++) {
                        sq += q(kk);
                        smuk += muk(kk);
                    }
                    q(jj + 1) = 1 - sq;
                    muk(jj + 1) = mt - smuk;
                    mu(jj + 1) = muk(jj + 1) / q(jj + 1);
                }
                if (jj>0 && jj<(nLevels - 1)) {
                    q(jj + 1) = 0.0;
                    muk(jj + 1) = 0.0;
                }
                lastIndex(jj) = ibin(ii, jj);
            }
        }
        // Between-class variance of the current combination.
        variance = 0.0;
        for (int jj = 0; jj <= nLevels; jj++) {
            variance += q(jj)*(mu(jj) - mt)*(mu(jj) - mt);
        }
        if (variance > bestVariance) {
            bestVariance = variance;
            for (int jj = 0; jj<nLevels; jj++) {
                threshold(jj) = oval(ibin(ii, jj));
            }
        }
    }
    cout << "Optimized thresholds: ";
    for (int jj = 0; jj<nLevels; jj++) {
        cout << threshold(jj) << " ";
    }
    cout << endl;
    // Label every element with (index + 1) of the last threshold it reaches.
    for (unsigned int jj = 0; jj<map.n_rows; jj++) {
        for (unsigned int kk = 0; kk<map.n_cols; kk++) {
            for (int ll = 0; ll<nLevels; ll++) {
                if (map(jj, kk) >= threshold(ll)) {
                    mapr(jj, kk) = ll+1;
                }
            }
        }
    }
    return mapr;
}
/**
 * Binomial coefficient C(n, r): the number of ways to choose r items out of n.
 *
 * @param n  size of the set
 * @param r  number of items chosen
 * @return C(n, r); 0 when r < 0 or r > n; INT_MAX when the true value does not
 *         fit in an int (saturated instead of overflowing).
 */
int nCombinations(int n, int r) {
    if (r < 0 || r > n) return 0;
    if (r * 2 > n) r = n - r;   // C(n, r) == C(n, n - r); use the smaller r
    if (r == 0) return 1;
    // 64-bit accumulator: the product below can exceed the int range even when
    // the value after the division (and the final result) fits.
    long long ret = n;
    for (int i = 2; i <= r; ++i) {
        ret *= (n - i + 1);
        ret /= i;               // exact: ret is C(n, i) after this step
        // C(n, i) only grows while i <= n/2, so once it leaves the int range
        // the final result cannot fit either.
        if (ret > INT_MAX) return INT_MAX;
    }
    return static_cast<int>(ret);
}

Resources