CUDA tiled 2D convolution in shared memory is slower than global memory

I implemented two convolutions, both using constant memory for the mask.
One without tiling, reading directly from global memory:
__global__ void constGradientConvolution(uint8_t* inputImgData, uint8_t* gradientImgData, int w, int h) {
    // Calculate the global thread positions
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Starting index for calculation
    int start_r = row - SOBEL_OP_RADIUS;
    int start_c = col - SOBEL_OP_RADIUS;
    // Temp value for calculation
    int temp = 0;
    // Iterate over all the rows
    for (int i = 0; i < SOBEL_OP_DIM; i++) {
        // Go over each column
        for (int j = 0; j < SOBEL_OP_DIM; j++) {
            // Range check for rows
            if ((start_r + i) >= 0 && (start_r + i) < h) {
                // Range check for columns
                if ((start_c + j) >= 0 && (start_c + j) < w) {
                    // Accumulate result
                    temp += inputImgData[(start_r + i) * w + (start_c + j)] *
                            constMask[i * SOBEL_OP_DIM + j];
                }
            }
        }
    }
    // Write back the result; guard the store so threads past the image edge
    // (when w or h is not a multiple of the block size) do not write out of bounds
    if (row < h && col < w) {
        gradientImgData[row * w + col] = (uint8_t)abs(temp);
    }
}
and one with tiling, loading the input into shared memory (credits to https://www.cstechera.com/2015/07/two-dimensional-2d-image-convolution-in-CUDA.html):
__global__ void tiledGradientConvolution(uint8_t* inputImgData, uint8_t* gradientImgData, int width, int height) {
    __shared__ uint8_t N_ds[SharedDim_y][SharedDim_x];
    // First batch loading
    int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
        destY = dest / SharedDim_x, destX = dest % SharedDim_x,
        srcY = blockIdx.y * TILE_HEIGHT + destY - SOBEL_OP_RADIUS,
        srcX = blockIdx.x * TILE_WIDTH + destX - SOBEL_OP_RADIUS,
        src = (srcY * width + srcX);
    if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
        N_ds[destY][destX] = inputImgData[src];
    else
        N_ds[destY][destX] = 0;
    for (int iter = 1; iter <= (SharedDim_x * SharedDim_y) / (TILE_WIDTH * TILE_HEIGHT); iter++)
    {
        // other batch loading
        dest = threadIdx.y * TILE_WIDTH + threadIdx.x + iter * (TILE_WIDTH * TILE_HEIGHT);
        destY = dest / SharedDim_x, destX = dest % SharedDim_x;
        srcY = blockIdx.y * TILE_HEIGHT + destY - SOBEL_OP_RADIUS;
        srcX = blockIdx.x * TILE_WIDTH + destX - SOBEL_OP_RADIUS;
        src = (srcY * width + srcX);
        if (destY < SharedDim_y && destX < SharedDim_x)
        {
            if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
                N_ds[destY][destX] = inputImgData[src];
            else
                N_ds[destY][destX] = 0;
        }
    }
    __syncthreads();
    int temp = 0;
    int y, x;
    for (y = 0; y < SOBEL_OP_DIM; y++)
        for (x = 0; x < SOBEL_OP_DIM; x++)
            temp += N_ds[threadIdx.y + y][threadIdx.x + x] * constMask[y * SOBEL_OP_DIM + x];
    y = blockIdx.y * TILE_HEIGHT + threadIdx.y;
    x = blockIdx.x * TILE_WIDTH + threadIdx.x;
    if (y < height && x < width) {
        gradientImgData[y * width + x] = (uint8_t)abs(temp);
    }
    __syncthreads();
}
According to nvprof, the shared-memory implementation is slower:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 53.27% 387.52us 2 193.76us 190.70us 196.82us [CUDA memcpy DtoH]
24.28% 176.62us 2 88.311us 608ns 176.01us [CUDA memcpy HtoD]
11.56% 84.102us 1 84.102us 84.102us 84.102us tiledGradientConvolution(unsigned char*, unsigned char*, int, int)
10.90% 79.270us 1 79.270us 79.270us 79.270us constGradientConvolution(unsigned char*, unsigned char*, int, int)
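As a cross-check of the nvprof numbers, the kernels can also be timed with CUDA events; a minimal sketch using the standard event API (d_in/d_out and the launch configuration shown further below are assumed to be set up already):
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
tiledGradientConvolution<<<dimGrid, dimBlock>>>(d_in, d_out, w, h);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // kernel time in milliseconds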
This is the kernel configuration:
#define SOBEL_OP_DIM 3
#define SOBEL_OP_RADIUS (SOBEL_OP_DIM / 2)
// tile dimension
#define TILE_WIDTH 16
#define TILE_HEIGHT 16
// Allocate masks in constant memory
__constant__ int constMask[SOBEL_OP_DIM * SOBEL_OP_DIM];
// Shared Memory Elements needed to be loaded as per Mask Size
#define SharedDim_x (TILE_WIDTH + SOBEL_OP_DIM - 1)
#define SharedDim_y (TILE_HEIGHT + SOBEL_OP_DIM - 1)
// in main code
dim3 dimBlock(TILE_WIDTH, TILE_HEIGHT);
dim3 dimGrid((test.w + TILE_WIDTH - 1) / TILE_WIDTH, (test.h + TILE_HEIGHT - 1) / TILE_HEIGHT);
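The mask itself is assumed to be uploaded to constant memory on the host before launch; a minimal sketch (the actual mask values are not shown in this post, so the standard horizontal Sobel kernel is used here as a placeholder):
// Host-side upload of the mask into constant memory; the values are an assumption
int hostMask[SOBEL_OP_DIM * SOBEL_OP_DIM] = {
    -1, 0, 1,
    -2, 0, 2,
    -1, 0, 1
};
cudaMemcpyToSymbol(constMask, hostMask, sizeof(hostMask));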
I expect shared memory to be faster, but I can't figure out what causes conflicts when loading from global memory.
Any help would be appreciated. Thank you in advance.

Related

Separable Gaussian blur - optimize vertical pass

I have implemented a separable Gaussian blur. The horizontal pass was relatively easy to optimize with SIMD processing. However, I am not sure how to optimize the vertical pass.
Accessing elements is not very cache friendly, and filling a SIMD lane would mean reading many different pixels. I was thinking about transposing the image, running the horizontal pass, and then transposing the image back; however, I am not sure it would gain any improvement because of the two transpose operations.
I have quite large images (16k resolution) and the kernel size is 19, so vectorizing the vertical pass gained only about 15%.
My vertical pass is as follows (it is inside a generic class typed on T, which can be uint8_t or float):
int yStart = kernelHalfSize;
int xStart = kernelHalfSize;
int yEnd = input.GetWidth() - kernelHalfSize;
int xEnd = input.GetHeigh() - kernelHalfSize;

const T * inData = input.GetData().data();
V * outData = output.GetData().data();

int kn = kernelHalfSize * 2 + 1;
int kn4 = kn - kn % 4;

for (int y = yStart; y < yEnd; y++)
{
    size_t yW = size_t(y) * output.GetWidth();
    size_t outX = size_t(xStart) + yW;

    size_t xEndSimd = xStart;
    int len = xEnd - xStart;
    len = len - len % 4;
    xEndSimd = xStart + len;

    for (int x = xStart; x < xEndSimd; x += 4)
    {
        size_t inYW = size_t(y) * input.GetWidth();
        size_t x0 = ((x + 0) - kernelHalfSize) + inYW;
        size_t x1 = x0 + 1;
        size_t x2 = x0 + 2;
        size_t x3 = x0 + 3;

        __m128 sumDot = _mm_setzero_ps();

        int i = 0;
        for (; i < kn4; i += 4)
        {
            __m128 kx = _mm_set_ps1(kernelDataX[i + 0]);
            __m128 ky = _mm_set_ps1(kernelDataX[i + 1]);
            __m128 kz = _mm_set_ps1(kernelDataX[i + 2]);
            __m128 kw = _mm_set_ps1(kernelDataX[i + 3]);

            __m128 dx, dy, dz, dw;

            if constexpr (std::is_same<T, uint8_t>::value)
            {
                //we need to convert uint8_t inputs to float
                __m128i u8_0 = _mm_loadu_si128((const __m128i*)(inData + x0));
                __m128i u8_1 = _mm_loadu_si128((const __m128i*)(inData + x1));
                __m128i u8_2 = _mm_loadu_si128((const __m128i*)(inData + x2));
                __m128i u8_3 = _mm_loadu_si128((const __m128i*)(inData + x3));

                __m128i u32_0 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_0, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_1 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_1, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_2 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_2, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_3 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_3, _mm_setzero_si128()),
                    _mm_setzero_si128());

                dx = _mm_cvtepi32_ps(u32_0);
                dy = _mm_cvtepi32_ps(u32_1);
                dz = _mm_cvtepi32_ps(u32_2);
                dw = _mm_cvtepi32_ps(u32_3);
            }
            else
            {
                /*
                //load 8 consecutive values
                auto dd = _mm256_loadu_ps(inData + x0);

                //extract parts by shifting and casting to 4 values float
                dx = _mm256_castps256_ps128(dd);
                dy = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 4, 3, 2, 1)));
                dz = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 5, 4, 3, 2)));
                dw = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 6, 5, 4, 3)));
                */

                dx = _mm_loadu_ps(inData + x0);
                dy = _mm_loadu_ps(inData + x1);
                dz = _mm_loadu_ps(inData + x2);
                dw = _mm_loadu_ps(inData + x3);
            }

            //calculate 4 dots at once
            //[dx, dy, dz, dw] <dot> [kx, ky, kz, kw]
            auto mx = _mm_mul_ps(dx, kx); //dx * kx
            auto my = _mm_fmadd_ps(dy, ky, mx); //mx + dy * ky
            auto mz = _mm_fmadd_ps(dz, kz, my); //my + dz * kz
            auto res = _mm_fmadd_ps(dw, kw, mz); //mz + dw * kw

            sumDot = _mm_add_ps(sumDot, res);

            x0 += 4;
            x1 += 4;
            x2 += 4;
            x3 += 4;
        }

        for (; i < kn; i++)
        {
            auto v = _mm_set_ps1(kernelDataX[i]);
            auto v2 = _mm_set_ps(
                *(inData + x3), *(inData + x2),
                *(inData + x1), *(inData + x0)
            );
            sumDot = _mm_add_ps(sumDot, _mm_mul_ps(v, v2));

            x0++;
            x1++;
            x2++;
            x3++;
        }

        sumDot = _mm_mul_ps(sumDot, _mm_set_ps1(weightX));

        if constexpr (std::is_same<V, uint8_t>::value)
        {
            __m128i asInt = _mm_cvtps_epi32(sumDot);
            asInt = _mm_packus_epi32(asInt, asInt);
            asInt = _mm_packus_epi16(asInt, asInt);
            uint32_t res = _mm_cvtsi128_si32(asInt);
            ((uint32_t *)(outData + outX))[0] = res;
            outX += 4;
        }
        else
        {
            float tmpRes[4];
            _mm_store_ps(tmpRes, sumDot);
            outData[outX + 0] = tmpRes[0];
            outData[outX + 1] = tmpRes[1];
            outData[outX + 2] = tmpRes[2];
            outData[outX + 3] = tmpRes[3];
            outX += 4;
        }
    }

    for (int x = xEndSimd; x < xEnd; x++)
    {
        int kn = kernelHalfSize * 2 + 1;
        const T * v = input.GetPixelStart(x - kernelHalfSize, y);
        float tmp = 0;
        for (int i = 0; i < kn; i++)
        {
            tmp += kernelDataX[i] * v[i];
        }
        tmp *= weightX;
        outData[outX] = ImageUtils::clamp_cast<V>(tmp);
        outX++;
    }
}
There’s a well-known trick for that.
While you compute each pass, read the input sequentially and use SIMD to compute, but write the result into another buffer, transposed, using scalar stores. Protip: SSE 4.1 has _mm_extract_ps; just don’t forget to cast your destination image pointer from float* into int*. Another thing about these stores: I would recommend _mm_stream_si32, because you want the cache space to be used by your input data. When you compute the second pass, you’ll again be reading sequential memory addresses, and the hardware prefetcher will deal with the latency.
This way both passes are identical; I usually call the same function twice, with different buffers.
The two transposes caused by your two passes cancel each other. Here’s an HLSL version, BTW.
There’s more. If your kernel size is only 19, it fits in 3 AVX registers. I think shuffle/permute/blend instructions are still faster than even L1 cache loads, i.e. it might be better to load the kernel outside the loop.
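To make the transposed stores concrete, here is a minimal sketch of the store step under the assumptions above (float pixels, SSE 4.1; the function and parameter names are illustrative, not from the original code):
#include <immintrin.h>
#include <cstddef>

// Write four horizontally adjacent results into four different rows of the
// destination buffer, i.e. transposed. dstStride is the destination row
// length in floats. _mm_extract_ps returns the raw bit pattern as an int,
// which is why the destination pointer is reinterpreted as int*.
static void storeTransposed(float* dst, size_t dstStride, size_t x, size_t y, __m128 v)
{
    int* d = reinterpret_cast<int*>(dst);
    _mm_stream_si32(d + (x + 0) * dstStride + y, _mm_extract_ps(v, 0));
    _mm_stream_si32(d + (x + 1) * dstStride + y, _mm_extract_ps(v, 1));
    _mm_stream_si32(d + (x + 2) * dstStride + y, _mm_extract_ps(v, 2));
    _mm_stream_si32(d + (x + 3) * dstStride + y, _mm_extract_ps(v, 3));
}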

Resize image using nearest neighbor with CUDA

I am implementing a nearest-neighbor kernel function to resize an input image, but the result is wrong and I have no idea why.
Here is the input image:
The result is wrong.
I use OpenCV to read the input image:
cv::Mat image = cv::imread("/home/tumh/test.jpg");
unsigned char* data = image.data;
int outH, outW;
float *out_data_host = test(data, image.rows, image.cols, outH, outW);
cv::Mat out_image(outH, outW, CV_32FC3);
memcpy(out_image.data, out_data_host, outH * outW * 3 * sizeof(float));
float* test(unsigned char* in_data_host, const int &inH, const int &inW, int &outH, int &outW) {
    // get the output size
    int im_size_min = std::min(inW, inH);
    int im_size_max = std::max(inW, inH);
    float scale_factor = static_cast<float>(640) / im_size_min;
    float im_scale_x = std::floor(inW * scale_factor / 64) * 64 / inW;
    float im_scale_y = std::floor(inH * scale_factor / 64) * 64 / inH;
    outW = inW * im_scale_x;
    outH = inH * im_scale_y;
    int channel = 3;

    unsigned char* in_data_dev;
    CUDA_CHECK(cudaMalloc(&in_data_dev, sizeof(unsigned char) * channel * inH * inW));
    CUDA_CHECK(cudaMemcpy(in_data_dev, in_data_host, 1 * sizeof(unsigned char) * channel * inH * inW, cudaMemcpyHostToDevice));

    // image pre process
    const float2 scale = make_float2(im_scale_x, im_scale_y);
    float * out_buffer = NULL;
    CUDA_CHECK(cudaMalloc(&out_buffer, sizeof(float) * channel * outH * outW));
    float *out_data_host = new float[sizeof(float) * channel * outH * outW];

    const dim3 threads(32, 32);
    const dim3 block(iDivUp(outW, threads.x), iDivUp(outW, threads.y));

    gpuPreImageNet<<<block, threads>>>(scale, in_data_dev, inW, out_buffer, outW, outH);

    CUDA_CHECK(cudaFree(in_data_dev));
    CUDA_CHECK(cudaMemcpy(out_data_host, out_buffer, sizeof(float) * channel * outH * outW, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(out_buffer));
    return out_data_host;
}
Here is the resize kernel function
__global__ void gpuPreImageNet(float2 scale, unsigned char* input, int iWidth, float* output, int oWidth, int oHeight)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = oWidth * oHeight;
    int channel = 3;

    if (x >= oWidth || y >= oHeight)
        return;

    const int dx = ((float)x * scale.x);
    const int dy = ((float)y * scale.y);
    const unsigned char* px = input + dy * iWidth * channel + dx * channel;
    const float3 bgr = make_float3(*(px + 0), *(px + 1), *(px + 2));

    output[channel * y * oWidth + channel * x + 0] = bgr.x;
    output[channel * y * oWidth + channel * x + 1] = bgr.y;
    output[channel * y * oWidth + channel * x + 2] = bgr.z;
}
Most of the implementation is from https://github.com/soulsheng/ResizeNN/blob/master/resizeCUDA/resizeNN.cu
Any idea?
Maybe you are observing an uninitialized memory problem.
As I understand your code, the out_data_host allocation is too big:
new float[sizeof(float) * channel * outH * outW];
should be
new float[channel * outH * outW];
Then out_buffer is uninitialized; add a cudaMemset after the cudaMalloc line.
To simplify your code, since you already use OpenCV to load images, why don't you use OpenCV to resize your images?
cv::resize // Host side method is probably better since you'll have less data copied through PCI-Express
// or
cv::cuda::resize
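A host-side sketch of that suggestion, reusing the scale factors computed in the question (cv::INTER_NEAREST keeps the nearest-neighbor behaviour):
cv::Mat resized;
cv::resize(image, resized, cv::Size(), im_scale_x, im_scale_y, cv::INTER_NEAREST);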
It took me around two days to figure out a solution for this problem. Basically, I was building a GPU-based image preprocessing pipeline for my project. Here's the custom CUDA kernel.
For grayscale image resizing, change channel from 3 to 1 and it should work.
__global__ void resize_kernel(real* pIn, real* pOut, int widthIn, int heightIn, int widthOut, int heightOut)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int channel = 3;

    if (i < heightOut && j < widthOut)
    {
        // nearest-neighbor source coordinates
        int iIn = i * heightIn / heightOut;
        int jIn = j * widthIn / widthOut;
        for (int c = 0; c < channel; c++)
            pOut[(i * widthOut + j) * channel + c] = pIn[(iIn * widthIn + jIn) * channel + c];
    }
}
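A hypothetical launch for this kernel, mirroring the question's iDivUp helper (the device pointers d_in and d_out are assumed to be allocated and filled elsewhere):
dim3 threads(32, 32);
dim3 blocks(iDivUp(widthOut, threads.x), iDivUp(heightOut, threads.y));
resize_kernel<<<blocks, threads>>>(d_in, d_out, widthIn, heightIn, widthOut, heightOut);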

What are slices in OpenGL?

In the code below, why do we need slices, and what are they for?
//https://github.com/danginsburg/opengles-book-samples/blob/604a02cc84f9cc4369f7efe93d2a1d7f2cab2ba7/iPhone/Common/esUtil.h#L110
int esGenSphere(int numSlices, float radius, float **vertices,
                float **texCoords, uint16_t **indices, int *numVertices_out) {
    int numParallels = numSlices / 2;
    int numVertices = (numParallels + 1) * (numSlices + 1);
    int numIndices = numParallels * numSlices * 6;
    float angleStep = (2.0f * ES_PI) / ((float) numSlices);

    if (vertices != NULL) {
        *vertices = malloc(sizeof(float) * 3 * numVertices);
    }
    if (texCoords != NULL) {
        *texCoords = malloc(sizeof(float) * 2 * numVertices);
    }
    if (indices != NULL) {
        *indices = malloc(sizeof(uint16_t) * numIndices);
    }

    for (int i = 0; i < numParallels + 1; i++) {
        for (int j = 0; j < numSlices + 1; j++) {
            int vertex = (i * (numSlices + 1) + j) * 3;
            if (vertices) {
                (*vertices)[vertex + 0] = radius * sinf(angleStep * (float)i) * sinf(angleStep * (float)j);
                (*vertices)[vertex + 1] = radius * cosf(angleStep * (float)i);
                (*vertices)[vertex + 2] = radius * sinf(angleStep * (float)i) * cosf(angleStep * (float)j);
            }
            if (texCoords) {
                int texIndex = (i * (numSlices + 1) + j) * 2;
                (*texCoords)[texIndex + 0] = (float)j / (float)numSlices;
                (*texCoords)[texIndex + 1] = 1.0f - ((float)i / (float)numParallels);
            }
        }
    }

    // Generate the indices
    if (indices != NULL) {
        uint16_t *indexBuf = (*indices);
        for (int i = 0; i < numParallels; i++) {
            for (int j = 0; j < numSlices; j++) {
                *indexBuf++ = i * (numSlices + 1) + j;
                *indexBuf++ = (i + 1) * (numSlices + 1) + j;
                *indexBuf++ = (i + 1) * (numSlices + 1) + (j + 1);
                *indexBuf++ = i * (numSlices + 1) + j;
                *indexBuf++ = (i + 1) * (numSlices + 1) + (j + 1);
                *indexBuf++ = i * (numSlices + 1) + (j + 1);
            }
        }
    }

    if (numVertices_out) {
        *numVertices_out = numVertices;
    }
    return numIndices;
}
That code generates a sphere mesh that looks like this:
Source: https://commons.wikimedia.org/wiki/File:Sphere_wireframe_10deg_6r.svg CC BY 3.0
As you can see in the picture, there are horizontal parallel lines, and vertical lines which all meet at the poles. The horizontal lines are typically called parallels whereas the vertical ones are called meridians. The author of that code apparently didn't know this term, so they called it "slices" instead.
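For example, calling the function with numSlices = 20 gives numParallels = 10, (10 + 1) * (20 + 1) = 231 vertices, and 10 * 20 * 6 = 1200 indices:
float *verts, *uvs;
uint16_t *idx;
int numVerts;
int numIdx = esGenSphere(20, 1.0f, &verts, &uvs, &idx, &numVerts);
// numVerts == 231, numIdx == 1200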

How to perform skin tone matching

( face )
( body )
Hi, I am new to image processing and OpenCV C/C++. I am wondering whether it is possible to extract the skin tone from the first image (face) and then apply it to the second image (body).
In other words, the user uploads his face image, and the program extracts the skin tone from that image and applies it to the body.
Thanks,
Aisha
This is a hard problem to solve, especially given the variation of colours depending on lighting and reflection. I have worked previously on finding skin in images, and generally the Cr (chroma red) component of the YCbCr colour space stands out prominently on skin. You might be able to exploit this information to find skin regions.
Here are a couple of papers that attempt to use colour for locating human skin:
1. Interaction between hands and wearable cameras
2. Markerless inspection of augmented reality objects
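As a sketch of the Cr observation above using OpenCV's C++ API (variable names are illustrative):
cv::Mat ycrcb;
cv::Mat channels[3];
cv::cvtColor(faceBgr, ycrcb, cv::COLOR_BGR2YCrCb);
cv::split(ycrcb, channels);
// channels[1] is the Cr plane, which tends to stand out on skin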
For finding skin you can use one of these formulas:
1) With normalized RGB space:
for (int i = 0; i < m_image->height; ++i)
{
    for (int j = 0; j < m_image->width; ++j)
    {
        if (m_image->nChannels == 3)
        {
            int valueR = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3 + 2];
            int valueG = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3 + 1];
            int valueB = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3];
            float normR = static_cast<float>(valueR) / static_cast<float>(valueR + valueG + valueB);
            float normG = static_cast<float>(valueG) / static_cast<float>(valueR + valueG + valueB);
            float normB = static_cast<float>(valueB) / static_cast<float>(valueR + valueG + valueB);
            if ((normB / normG < 1.249) &&
                ((normR + normG + normB) / (3 * normR) > 0.696) &&
                (1/3.0 - normB / (normR + normG + normB) > 0.014) &&
                (normG / (3 * (normR + normG + normB)) < 0.108))
            {
                //pixel is skin
            }
        }
    }
}
2) in RGB space:
for (size_t i = 0; i < m_image->height; ++i)
{
    for (size_t j = 0; j < m_image->width; ++j)
    {
        if (m_image->nChannels == 3)
        {
            int R = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3 + 2];
            int G = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3 + 1];
            int B = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3];
            if ((R > 95) && (G > 40) && (B > 20) &&
                (std::max(R, std::max(G, B)) - std::min(R, std::min(G, B)) > 15) &&
                (std::abs(R - G) > 15) && (R > G) && (R > B))
            {
                //skin pixel
            }
        }
    }
}
3) in YCrCb space:
for (size_t i = 0; i < m_image->height; ++i)
{
    for (size_t j = 0; j < m_image->width; ++j)
    {
        if (m_image->nChannels == 3)
        {
            int Cr = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3 + 2];
            int Cb = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3 + 1];
            int Y = (reinterpret_cast<uchar*>(m_image->imageData + i * m_image->widthStep))[j * 3];
            if ((Y > 80) && (Cb > 85) && (Cb < 135) &&
                (Cr > 135) && (Cr < 180))
            {
                //skin pixel
            }
        }
    }
}
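The third rule can also be written with OpenCV's C++ API via cv::inRange; this sketch assumes OpenCV's Y, Cr, Cb channel order after cvtColor:
cv::Mat ycrcb, skinMask;
cv::cvtColor(bgrImage, ycrcb, cv::COLOR_BGR2YCrCb);
// bounds follow formula 3: Y > 80, 135 < Cr < 180, 85 < Cb < 135
cv::inRange(ycrcb, cv::Scalar(80, 135, 85), cv::Scalar(255, 180, 135), skinMask);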

Pitch Shifting on Blackberry

I'm trying to change the pitch of some recorded sound using the PitchShifter.java class below:
package mypackage;
import net.rim.device.api.util.MathUtilities;
//package com.course.android.voicechanger;
//import android.util.Log;
/****************************************************************************
*
* NAME: PitchShift.cs
* VERSION: 1.2
* HOME URL: http://www.dspdimension.com
* KNOWN BUGS: none
*
* SYNOPSIS: Routine for doing pitch shifting while maintaining
* duration using the Short Time Fourier Transform.
*
* DESCRIPTION: The routine takes a pitchShift factor value which is between 0.5
* (one octave down) and 2. (one octave up). A value of exactly 1 does not change
* the pitch. numSampsToProcess tells the routine how many samples in indata[0...
* numSampsToProcess-1] should be pitch shifted and moved to outdata[0 ...
* numSampsToProcess-1]. The two buffers can be identical (ie. it can process the
* data in-place). fftFrameSize defines the FFT frame size used for the
* processing. Typical values are 1024, 2048 and 4096. It may be any value <=
* MAX_FRAME_LENGTH but it MUST be a power of 2. osamp is the STFT
* oversampling factor which also determines the overlap between adjacent STFT
* frames. It should at least be 4 for moderate scaling ratios. A value of 32 is
* recommended for best quality. sampleRate takes the sample rate for the signal
* in unit Hz, ie. 44100 for 44.1 kHz audio. The data passed to the routine in
* indata[] should be in the range [-1.0, 1.0), which is also the output range
* for the data, make sure you scale the data accordingly (for 16bit signed integers
* you would have to divide (and multiply) by 32768).
*
* COPYRIGHT 1999-2006 Stephan M. Bernsee <smb [AT] dspdimension [DOT] com>
*
* The Wide Open License (WOL)
*
* Permission to use, copy, modify, distribute and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice and this license appear in all source copies.
* THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF
* ANY KIND. See www.dspguru.com/wol.htm for more information.
*
*****************************************************************************/
/****************************************************************************
*
* This code was converted to C# by Michael Knight madmik3 at gmail dot com. sites.google.com/site/mikescoderama/
*
*****************************************************************************/
public class PitchShifter2 {

    private static int MAX_FRAME_LENGTH = 16000;
    private static float[] gInFIFO = new float[MAX_FRAME_LENGTH];
    private static float[] gOutFIFO = new float[MAX_FRAME_LENGTH];
    private static float[] gFFTworksp = new float[2 * MAX_FRAME_LENGTH];
    private static float[] gLastPhase = new float[MAX_FRAME_LENGTH / 2 + 1];
    private static float[] gSumPhase = new float[MAX_FRAME_LENGTH / 2 + 1];
    private static float[] gOutputAccum = new float[2 * MAX_FRAME_LENGTH];
    private static float[] gAnaFreq = new float[MAX_FRAME_LENGTH];
    private static float[] gAnaMagn = new float[MAX_FRAME_LENGTH];
    private static float[] gSynFreq = new float[MAX_FRAME_LENGTH];
    private static float[] gSynMagn = new float[MAX_FRAME_LENGTH];
    private static long gRover, gInit;

    public static void PitchShift2(float pitchShift, long numSampsToProcess, float sampleRate, float[] indata) {
        // PitchShift2(pitchShift, numSampsToProcess, (long) 256, (long) 10, sampleRate, indata);
        PitchShift2(pitchShift, numSampsToProcess, (long) 1024, (long) 32, sampleRate, indata);
    }
    public static void PitchShift2(float pitchShift, long numSampsToProcess, long fftFrameSize, long osamp, float sampleRate, float[] indata) {
        double magn, phase, tmp, window, real, imag;
        double freqPerBin, expct;
        long i, k, qpd, index, inFifoLatency, stepSize, fftFrameSize2;
        float[] outdata = indata;

        /* set up some handy variables */
        fftFrameSize2 = fftFrameSize / 2;
        stepSize = fftFrameSize / osamp;
        freqPerBin = sampleRate / (double) fftFrameSize;
        expct = 2.0 * Math.PI * (double) stepSize / (double) fftFrameSize;
        inFifoLatency = fftFrameSize - stepSize;
        if (gRover == 0)
            gRover = inFifoLatency;

        int c = 0;
        int round = 0;

        /* main processing loop */
        for (i = 0; i < numSampsToProcess; i++) {

            /* As long as we have not yet collected enough data just read in */
            gInFIFO[(int) gRover] = indata[(int) i];
            outdata[(int) i] = gOutFIFO[(int) (gRover - inFifoLatency)];
            gRover++;

            /* now we have enough data for processing */
            if (gRover >= fftFrameSize) {
                c++;
                if (c > 100) {
                    // Log.d("Liwei", "round= " + round++);
                    System.out.println("PitchShifter2.PitchShift(.....................): Liwei" + "round= " + round++);
                    c = 0;
                }
                gRover = inFifoLatency;

                /* do windowing and re,im interleave */
                for (k = 0; k < fftFrameSize; k++) {
                    window = -.5 * Math.cos(2.0 * Math.PI * (double) k / (double) fftFrameSize) + .5;
                    gFFTworksp[(int) (2 * k)] = (float) (gInFIFO[(int) k] * window);
                    gFFTworksp[(int) (2 * k + 1)] = 0.0F;
                }

                /* ***************** ANALYSIS ******************* */
                /* do transform */
                ShortTimeFourierTransform(gFFTworksp, fftFrameSize, -1);

                /* this is the analysis step */
                for (k = 0; k <= fftFrameSize2; k++) {
                    /* de-interlace FFT buffer */
                    real = gFFTworksp[(int) (2 * k)];
                    imag = gFFTworksp[(int) (2 * k + 1)];
                    /* compute magnitude and phase */
                    magn = 2.0 * Math.sqrt(real * real + imag * imag);
                    phase = MathUtilities.atan2(imag, real);
                    /* compute phase difference */
                    tmp = phase - gLastPhase[(int) k];
                    gLastPhase[(int) k] = (float) phase;
                    /* subtract expected phase difference */
                    tmp -= (double) k * expct;
                    /* map delta phase into +/- Pi interval */
                    qpd = (long) (tmp / Math.PI);
                    if (qpd >= 0)
                        qpd += qpd & 1;
                    else
                        qpd -= qpd & 1;
                    tmp -= Math.PI * (double) qpd;
                    /* get deviation from bin frequency from the +/- Pi interval */
                    tmp = osamp * tmp / (2.0 * Math.PI);
                    /* compute the k-th partials' true frequency */
                    tmp = (double) k * freqPerBin + tmp * freqPerBin;
                    /* store magnitude and true frequency in analysis arrays */
                    gAnaMagn[(int) k] = (float) magn;
                    gAnaFreq[(int) k] = (float) tmp;
                }

                /* ***************** PROCESSING ******************* */
                /* this does the actual pitch shifting */
                for (int zero = 0; zero < fftFrameSize; zero++) {
                    gSynMagn[zero] = 0;
                    gSynFreq[zero] = 0;
                }
                for (k = 0; k <= fftFrameSize2; k++) {
                    index = (long) (k * pitchShift);
                    if (index <= fftFrameSize2) {
                        gSynMagn[(int) index] += gAnaMagn[(int) k];
                        gSynFreq[(int) index] = gAnaFreq[(int) k] * pitchShift;
                    }
                }

                /* ***************** SYNTHESIS ******************* */
                /* this is the synthesis step */
                for (k = 0; k <= fftFrameSize2; k++) {
                    /* get magnitude and true frequency from synthesis arrays */
                    magn = gSynMagn[(int) k];
                    tmp = gSynFreq[(int) k];
                    /* subtract bin mid frequency */
                    tmp -= (double) k * freqPerBin;
                    /* get bin deviation from freq deviation */
                    tmp /= freqPerBin;
                    /* take osamp into account */
                    tmp = 2.0 * Math.PI * tmp / osamp;
                    /* add the overlap phase advance back in */
                    tmp += (double) k * expct;
                    /* accumulate delta phase to get bin phase */
                    gSumPhase[(int) k] += (float) tmp;
                    phase = gSumPhase[(int) k];
                    /* get real and imag part and re-interleave */
                    gFFTworksp[(int) (2 * k)] = (float) (magn * Math.cos(phase));
                    gFFTworksp[(int) (2 * k + 1)] = (float) (magn * Math.sin(phase));
                }

                /* zero negative frequencies */
                for (k = fftFrameSize + 2; k < 2 * fftFrameSize; k++)
                    gFFTworksp[(int) k] = 0.0F;

                /* do inverse transform */
                ShortTimeFourierTransform(gFFTworksp, fftFrameSize, 1);

                /* do windowing and add to output accumulator */
                for (k = 0; k < fftFrameSize; k++) {
                    window = -.5 * Math.cos(2.0 * Math.PI * (double) k / (double) fftFrameSize) + .5;
                    gOutputAccum[(int) k] += (float) (2.0 * window * gFFTworksp[(int) (2 * k)] / (fftFrameSize2 * osamp));
                }
                for (k = 0; k < stepSize; k++)
                    gOutFIFO[(int) k] = gOutputAccum[(int) k];

                /* shift accumulator */
                // memmove(gOutputAccum, gOutputAccum + stepSize, fftFrameSize * sizeof(float));
                for (k = 0; k < fftFrameSize; k++) {
                    gOutputAccum[(int) k] = gOutputAccum[(int) (k + stepSize)];
                }

                /* move input FIFO */
                for (k = 0; k < inFifoLatency; k++)
                    gInFIFO[(int) k] = gInFIFO[(int) (k + stepSize)];
            }
        }
    }
    public static void ShortTimeFourierTransform(float[] fftBuffer, long fftFrameSize, long sign) {
        float wr, wi, arg, temp;
        float tr, ti, ur, ui;
        long i, bitm, j, le, le2, k;

        /* bit-reversal reordering */
        for (i = 2; i < 2 * fftFrameSize - 2; i += 2) {
            for (bitm = 2, j = 0; bitm < 2 * fftFrameSize; bitm <<= 1) {
                if ((i & bitm) != 0)
                    j++;
                j <<= 1;
            }
            if (i < j) {
                temp = fftBuffer[(int) i];
                fftBuffer[(int) i] = fftBuffer[(int) j];
                fftBuffer[(int) j] = temp;
                temp = fftBuffer[(int) (i + 1)];
                fftBuffer[(int) (i + 1)] = fftBuffer[(int) (j + 1)];
                fftBuffer[(int) (j + 1)] = temp;
            }
        }

        /* butterfly passes */
        long max = (long) (MathUtilities.log(fftFrameSize) / MathUtilities.log(2.0) + .5);
        for (k = 0, le = 2; k < max; k++) {
            le <<= 1;
            le2 = le >> 1;
            ur = 1.0F;
            ui = 0.0F;
            arg = (float) Math.PI / (le2 >> 1);
            wr = (float) Math.cos(arg);
            wi = (float) (sign * Math.sin(arg));
            for (j = 0; j < le2; j += 2) {
                for (i = j; i < 2 * fftFrameSize; i += le) {
                    tr = fftBuffer[(int) (i + le2)] * ur - fftBuffer[(int) (i + le2 + 1)] * ui;
                    ti = fftBuffer[(int) (i + le2)] * ui + fftBuffer[(int) (i + le2 + 1)] * ur;
                    fftBuffer[(int) (i + le2)] = fftBuffer[(int) i] - tr;
                    fftBuffer[(int) (i + le2 + 1)] = fftBuffer[(int) (i + 1)] - ti;
                    fftBuffer[(int) i] += tr;
                    fftBuffer[(int) (i + 1)] += ti;
                }
                tr = ur * wr - ui * wi;
                ui = ur * wi + ui * wr;
                ur = tr;
            }
        }
    }
}
I get a byte array from the sound file, convert it into a float array, and pass it to PitchShift2();
after that I convert the float array back to a byte array, form a stream from it, and pass that to the player.
But it gives an exception: "Unsupported file format".
I have also taken care of endianness while converting bytes to floats and vice versa.
Can anyone tell me how to use this class properly?
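The scaling that the routine's header comment calls for (divide and multiply by 32768 for 16-bit signed samples) looks like the sketch below; it is written in C++ for illustration rather than BlackBerry Java, and assumes 16-bit little-endian PCM:
#include <cstdint>
#include <cstddef>

// 16-bit little-endian PCM -> float in [-1.0, 1.0)
void pcm16ToFloat(const uint8_t* bytes, float* out, size_t numSamples) {
    for (size_t i = 0; i < numSamples; ++i) {
        int16_t s = (int16_t)(bytes[2 * i] | (bytes[2 * i + 1] << 8));
        out[i] = s / 32768.0f;
    }
}

// float -> 16-bit little-endian PCM, clamped to avoid wrap-around
void floatToPcm16(const float* in, uint8_t* bytes, size_t numSamples) {
    for (size_t i = 0; i < numSamples; ++i) {
        float v = in[i] * 32768.0f;
        if (v > 32767.0f) v = 32767.0f;
        if (v < -32768.0f) v = -32768.0f;
        int16_t s = (int16_t)v;
        bytes[2 * i] = (uint8_t)(s & 0xFF);
        bytes[2 * i + 1] = (uint8_t)((s >> 8) & 0xFF);
    }
}
Also note that "Unsupported file format" typically means the player is rejecting the container rather than the shifted samples themselves; the processed PCM still needs a header the player understands (e.g. a WAV header) before it is streamed.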
