UPDATE 2016-03-15
Please take a look at this project: https://github.com/ooper-shlab/aurioTouch2.0-Swift. It has been ported to Swift and contains every answer you're looking for, if you cam here.
I did a lot of research and learned a lot about FFT and the Accelerate Framework. But after days of experiments I'm kind of frustrated.
I want to display the frequency spectrum of an audio file during playback in a diagram. For every time interval it should show the magnitude in db on the Y-axis (displayed by a red bar) for every frequency (in my case 512 values) calculated by a FFT on the X-Axis.
The output should look like this:
I fill a buffer with 1024 samples extracting only the left channel for the beginning. Then I do all this FFT stuff.
Here is my code so far:
Setting up some variables
- (void)setupVars
{
maxSamples = 1024;
log2n = log2f(maxSamples);
n = 1 << log2n;
stride = 1;
nOver2 = maxSamples/2;
A.realp = (float *) malloc(nOver2 * sizeof(float));
A.imagp = (float *) malloc(nOver2 * sizeof(float));
memset(A.imagp, 0, nOver2 * sizeof(float));
obtainedReal = (float *) malloc(n * sizeof(float));
originalReal = (float *) malloc(n * sizeof(float));
setupReal = vDSP_create_fftsetup(log2n, FFT_RADIX2);
}
Doing the FFT. FrequencyArray is just a data structure that holds 512 float values.
- (FrequencyArry)performFastFourierTransformForSampleData:(SInt16*)sampleData andSampleRate:(UInt16)sampleRate
{
NSLog(#"log2n %i n %i, nOver2 %i", log2n, n, nOver2);
// n = 1024
// log2n 10
// nOver2 = 512
for (int i = 0; i < n; i++) {
originalReal[i] = (float) sampleData[i];
}
vDSP_ctoz((COMPLEX *) originalReal, 2, &A, 1, nOver2);
vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_FORWARD);
float scale = (float) 1.0 / (2 * n);
vDSP_vsmul(A.realp, 1, &scale, A.realp, 1, nOver2);
vDSP_vsmul(A.imagp, 1, &scale, A.imagp, 1, nOver2);
vDSP_ztoc(&A, 1, (COMPLEX *) obtainedReal, 2, nOver2);
FrequencyArry frequencyArray;
for (int i = 0; i < nOver2; i++) {
frequencyArray.frequency[i] = log10f(obtainedReal[i]); // Magnitude in db???
}
return frequencyArray;
}
The output looks always kind of weird although it some how seems to move according to the music.
I'm happy that I came so far thanks to some very good posts here like this:
Using the apple FFT and accelerate Framework
But now I don't know what to do. What am I missing?
Firstly you're not applying a window function prior to the FFT - this will result in smearing of the spectrum due to spectral leakage.
Secondly, you're just using the real component of the FFT output bins to calculate dB magnitude - you need to use the complex magnitude:
magnitude_dB = 10 * log10(re * re + im * im);
Related
I am trying to make a colored waveform using the output of the following code. But when I run it, I only get certain numbers (see the freq variable, it uses the bin size, frame rate and index to make these frequencies) as output frequencies. I'm no math expert, even though I cobbled this together from existing code and answers.
//
// colored_waveform.c
// MixDJ
//
// Created by Jonathan Silverman on 3/14/19.
// Copyright © 2019 Jonathan Silverman. All rights reserved.
//
#include "colored_waveform.h"
#include "fftw3.h"
#include <math.h>
#include "sndfile.h"
//int N = 1024;
// helper function to apply a windowing function to a frame of samples
void calcWindow(double* in, double* out, int size) {
for (int i = 0; i < size; i++) {
double multiplier = 0.5 * (1 - cos(2*M_PI*i/(size - 1)));
out[i] = multiplier * in[i];
}
}
// helper function to compute FFT
void fft(double* samples, fftw_complex* out, int size) {
fftw_plan p;
p = fftw_plan_dft_r2c_1d(size, samples, out, FFTW_ESTIMATE);
fftw_execute(p);
fftw_destroy_plan(p);
}
// find the index of array element with the highest absolute value
// probably want to take some kind of moving average of buf[i]^2
// and return the maximum found
double maxFreqIndex(fftw_complex* buf, int size, float fS) {
double max_freq = 0;
double last_magnitude = 0;
for(int i = 0; i < (size / 2) - 1; i++) {
double freq = i * fS / size;
// printf("freq: %f\n", freq);
double magnitude = sqrt(buf[i][0]*buf[i][0] + buf[i][1]*buf[i][1]);
if(magnitude > last_magnitude)
max_freq = freq;
last_magnitude = magnitude;
}
return max_freq;
}
//
//// map a frequency to a color, red = lower freq -> violet = high freq
//int freqToColor(int i) {
//
//}
void generateWaveformColors(const char path[]) {
printf("Generating waveform colors\n");
SNDFILE *infile = NULL;
SF_INFO sfinfo;
infile = sf_open(path, SFM_READ, &sfinfo);
sf_count_t numSamples = sfinfo.frames;
// sample rate
float fS = 44100;
// float songLengLengthSeconds = numSamples / fS;
// printf("seconds: %f", songLengLengthSeconds);
// size of frame for analysis, you may want to play with this
float frameMsec = 5;
// samples in a frame
int frameSamples = (int)(fS / (frameMsec * 1000));
// how much overlap each frame, you may want to play with this one too
int frameOverlap = (frameSamples / 2);
// color to use for each frame
// int outColors[(numSamples / frameOverlap) + 1];
// scratch buffers
double* tmpWindow;
fftw_complex* tmpFFT;
tmpWindow = (double*) fftw_malloc(sizeof(double) * frameSamples);
tmpFFT = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * frameSamples);
printf("Processing waveform for colors\n");
for (int i = 0, outptr = 0; i < numSamples; i += frameOverlap, outptr++)
{
double inSamples[frameSamples];
sf_read_double(infile, inSamples, frameSamples);
// window another frame for FFT
calcWindow(inSamples, tmpWindow, frameSamples);
// compute the FFT on the next frame
fft(tmpWindow, tmpFFT, frameSamples);
// which frequency is the highest?
double freqIndex = maxFreqIndex(tmpFFT, frameSamples, fS);
printf("%i: ", i);
printf("Max freq: %f\n", freqIndex);
// map to color
// outColors[outptr] = freqToColor(freqIndex);
}
printf("Done.");
sf_close (infile);
}
Here is some of the output:
2094216: Max freq: 5512.500000
2094220: Max freq: 0.000000
2094224: Max freq: 0.000000
2094228: Max freq: 0.000000
2094232: Max freq: 5512.500000
2094236: Max freq: 5512.500000
It only shows certain numbers, not a wide variety of frequencies like it maybe should. Or am I wrong? Is there anything wrong with my code you guys can see? The color stuff is commented out because I haven't done it yet.
The frequency resolution of an FFT is limited by the length of the data sample you have. The more samples you have, the higher the frequency resolution.
In your specific case you chose frames of 5 milliseconds, which is then transformed to a number of samples on the following line:
// samples in a frame
int frameSamples = (int)(fS / (frameMsec * 1000));
This corresponds to only 8 samples at the specified 44100Hz sampling rate. The frequency resolution with such a small frame size can be computed to be
44100 / 8
or 5512.5Hz, a rather poor resolution. Correspondingly, the observed frequencies will always be one of 0, 5512.5, 11025, 16537.5 or 22050Hz.
To get a higher resolution you should increase the number of samples used for analysis by increasing frameMsec (as suggested by the comment "size of frame for analysis, you may want to play with this").
I'm trying to do a realtime FFT on microphone data in iOS using Novacaine and I cant seem to pass the FFT setup and ring buffer to my FFT function.
For testing, my plan was to do the setup in the viewWillAppear method, then execute my FFT function when a button is pressed. I'll then deallocate the memory for my buffers and destroy the FFTsetup when the window closes.
Here's what I have so far. I've tried several variations of passing arguments to viewWillAppear but nothing seems to be working. Any suggestions are appreceated.
- (void)viewWillAppear:(BOOL)animated
{
[super viewWillAppear:animated];
ringBuffer = new RingBuffer(8192, 2);
audioManager = [Novocaine audioManager];
[audioManager setInputBlock:^(float *data, UInt32 numFrames, UInt32 numChannels) {
// Setup FFT here
int numSamples = 8192;
// Setup the length
vDSP_Length log2n = log2f(numSamples);
// Calculate the weights array. This is a one-off operation.
FFTSetup fftSetup = vDSP_create_fftsetup(log2n, FFT_RADIX2);
// For an FFT, numSamples must be a power of 2, i.e. is always even
int nOver2 = numSamples/2;
// Populate *window with the values for a hamming window function
float *window = (float *)malloc(sizeof(float) * numSamples);
vDSP_hamm_window(window, numSamples, 0);
// Window the samples
vDSP_vmul(data, 1, window, 1, data, 1, numSamples);
// Define complex buffer
COMPLEX_SPLIT A;
A.realp = (float *) malloc(nOver2*sizeof(float));
A.imagp = (float *) malloc(nOver2*sizeof(float));
}];
}
- (IBAction)buttonPressed:(id)sender {
// do myFFT here
// Pack samples:
// C(re) -> A[n], C(im) -> A[n+1]
vDSP_ctoz((COMPLEX*)data, 2, &A, 1, numSamples/2);
//Perform a forward FFT using fftSetup and A
//Results are returned in A
vDSP_fft_zrip(fftSetup, &A, 1, log2n, FFT_FORWARD);
//Convert COMPLEX_SPLIT A result to magnitudes
float *amp = (float *)malloc(sizeof(float) * numSamples);
amp[0] = A.realp[0]/(numSamples*2);
float max = 0;
int indexOfMax = -1;
for(int i=1; i<numSamples; i++) {
amp[i]=A.realp[i]*A.realp[i]+A.imagp[i]*A.imagp[i];
//printf("i[%ld]: %.1f %ldHz \n", (long)i, amp[i], (long)22000 * i/numSamples);
if(amp[i] > max) {
max = amp[i];
indexOfMax = i;
}
}
long fmax = ((long)indexOfMax - numSamples/2)*44100/4096;
printf("max frequency is %ld\n", fmax);
free(amp);
}
The real-time audio input block isn't for doing any heavy computation, such as an FFT. Instead the input block should just (quickly!) copy the data into a large enough ring buffer.
Later, during a displayLink timer or when the button is pressed, you can check the ring buffer to see if it has enough new data, and then do the FFT.
Your code also seems to confuse the FFT size numSamples with the (usually much smaller) actual amount of samples received by the callback, which is numFrames. You can't do the FFT until enough callbacks have been called for numFrames to sum up to be equal or greater than the FFT size.
I've been trying to get exact frequencies using the FFT in Apple's Accelerate framework, but I'm having trouble working out why my values are off the true frequency.
I have been using this article http://www.dspdimension.com/admin/pitch-shifting-using-the-ft/ as the basis for my implementation, and after really struggling to get to the point I'm at now, I am totally stumped.
So far I've got audio in -> Hanning window -> FFT -> phase calculation -> weird final output. I'd think that there will be a problem with my maths somewhere, but I'm really out of ideas by now.
The outputs are a lot lower what they should be, e.g., I input 440Hz and it prints out 190Hz, or I input 880Hz and it prints out 400Hz. For the most part these results are consistent, but not always, and there doesn't seem to be any common factor between anything either...
Here is my code:
enum
{
sampleRate = 44100,
osamp = 4,
samples = 4096,
range = samples * 7 / 16,
step = samples / osamp
};
NSMutableArray *fftResults;
static COMPLEX_SPLIT A;
static FFTSetup setupReal;
static uint32_t log2n, n, nOver2;
static int32_t stride;
static float expct = 2*M_PI*((double)step/(double)samples);
static float phase1[range];
static float phase2[range];
static float dPhase[range];
- (void)fftSetup
{
// Declaring integers
log2n = 12;
n = 1 << log2n;
stride = 1;
nOver2 = n / 2;
// Allocating memory for complex vectors
A.realp = (float *) malloc(nOver2 * sizeof(float));
A.imagp = (float *) malloc(nOver2 * sizeof(float));
// Allocating memory for FFT
setupReal = vDSP_create_fftsetup(log2n, FFT_RADIX2);
// Setting phase
memset(phase2, 0, range * sizeof(float));
}
// For each sample in buffer...
for (int bufferCount = 0; bufferCount < audioBufferList.mNumberBuffers; bufferCount++)
{
// Declaring samples from audio buffer list
SInt16 *samples = (SInt16*)audioBufferList.mBuffers[bufferCount].mData;
// Creating Hann window function
for (int i = 0; i < nOver2; i++)
{
double hannMultiplier = 0.5 * (1 - cos((2 * M_PI * i) / (nOver2 - 1)));
// Applying window to each sample
A.realp[i] = hannMultiplier * samples[i];
A.imagp[i] = 0;
}
// Applying FFT
vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_FORWARD);
// Detecting phase
vDSP_zvphas(&A, stride, phase1, stride, range);
// Calculating phase difference
vDSP_vsub(phase2, stride, phase1, stride, dPhase, stride, range);
// Saving phase
memcpy(phase2, phase1, range * sizeof(float));
// Extracting DSP outputs
for (size_t j = 0; j < nOver2; j++)
{
NSNumber *realNumbers = [NSNumber numberWithFloat:A.realp[j]];
NSNumber *imagNumbers = [NSNumber numberWithFloat:A.imagp[j]];
[real addObject:realNumbers];
[imag addObject:imagNumbers];
}
// Combining real and imaginary parts
[resultsCombined addObject:real];
[resultsCombined addObject:imag];
// Filling FFT output array
[fftResults addObject:resultsCombined];
}
}
int fftCount = [fftResults count];
NSLog(#"FFT Count: %d",fftCount);
// For each FFT...
for (int i = 0; i < fftCount; i++)
{
// Declaring integers for peak detection
float peak = 0;
float binNumber = 0;
// Declaring integers for phase detection
float deltaPhase;
static float trueFrequency[range];
for (size_t j = 1; j < range; j++)
{
// Calculating bin magnitiude
float realVal = [[[[fftResults objectAtIndex:i] objectAtIndex:0] objectAtIndex:j] floatValue];
float imagVal = [[[[fftResults objectAtIndex:i] objectAtIndex:1] objectAtIndex:j] floatValue];
float magnitude = sqrtf(realVal*realVal + imagVal*imagVal);
// Peak detection
if (magnitude > peak)
{
peak = magnitude;
binNumber = (float)j;
}
// Getting phase difference
deltaPhase = dPhase[j];
// Subtract expected difference
deltaPhase -= (float)j*expct;
// Map phase difference into +/- pi interval
int qpd = deltaPhase / M_PI;
if (qpd >= 0)
qpd += qpd&1;
else
qpd -= qpd&1;
deltaPhase -= M_PI * (float)qpd;
// Getting bin deviation from +/i interval
float deltaFrequency = osamp * deltaPhase / (2 * M_PI);
// Calculating true frequency at the j-th partial
trueFrequency[j] = (j * (sampleRate/samples)) + (deltaFrequency * (sampleRate/samples));
}
UInt32 mag;
mag = binNumber;
// Extracting frequency at bin peak
float f = trueFrequency[mag];
NSLog(#"True frequency = %fHz", f);
float b = roundf(binNumber*(sampleRate/nOver2));
NSLog(#" Bin frequency = %fHz", b);
}
Note that the expected phase difference (even for a bin-centered frequency) depends on both the window offset or overlap of the FFT pairs, and the bin number or frequency of the FFT result. e.g. If you offset the windows by very little (1 sample), then the unwrapped phase difference between 2 FFTs will be smaller than with a larger offset. At the same offset, if the frequency is higher, the expected phase difference between the same bin of two FFTs will be greater (or it will wrap more).
I've read these question:
Using the Apple FFT and Accelerate Framework
How do I set up a buffer when doing an FFT using the Accelerate framework?
iOS FFT Accerelate.framework draw spectrum during playback
They all describe how to setup fft with the accelerate framework. With their help I was able to setup fft and get a basic spectrum analyzer. Right now, I'm displaying all values I got from the fft. However, I only want to show 10-15, or a variable number, of bars respreseting certain frequencies. Just like the iTunes or WinAmp Level Meter.
1. Do I need to average magnitude values from a range of frequencies? Or do they just show you a magnitude for the specific frequency bar?
2. Also, do I need to convert my magnitude values to db?
3. How do I map my data to a certain range. Do I map against the max db range for my sounds bitdepth? Getting the max Value for a bin will lead to jumping max mapping values.
My RenderCallback:
static OSStatus PlaybackCallback(void *inRefCon,
AudioUnitRenderActionFlags *ioActionFlags,
const AudioTimeStamp *inTimeStamp,
UInt32 inBusNumber,
UInt32 inNumberFrames,
AudioBufferList *ioData)
{
UInt32 maxSamples = kAudioBufferNumFrames;
UInt32 log2n = log2f(maxSamples); //bins
UInt32 n = 1 << log2n;
UInt32 stride = 1;
UInt32 nOver2 = n/2;
COMPLEX_SPLIT A;
float *originalReal, *obtainedReal, *frequencyArray, *window, *in_real;
in_real = (float *) malloc(maxSamples * sizeof(float));
A.realp = (float *) malloc(nOver2 * sizeof(float));
A.imagp = (float *) malloc(nOver2 * sizeof(float));
memset(A.imagp, 0, nOver2 * sizeof(float));
obtainedReal = (float *) malloc(n * sizeof(float));
originalReal = (float *) malloc(n * sizeof(float));
frequencyArray = (float *) malloc(n * sizeof(float));
//-- window
UInt32 windowSize = maxSamples;
window = (float *) malloc(windowSize * sizeof(float));
memset(window, 0, windowSize * sizeof(float));
// vDSP_hann_window(window, windowSize, vDSP_HANN_DENORM);
vDSP_blkman_window(window, windowSize, 0);
vDSP_vmul(ioBuffer, 1, window, 1, in_real, 1, maxSamples);
//-- window
vDSP_ctoz((COMPLEX*)in_real, 2, &A, 1, maxSamples/2);
vDSP_fft_zrip(fftSetup, &A, stride, log2n, FFT_FORWARD);
vDSP_fft_zrip(fftSetup, &A, stride, log2n, FFT_INVERSE);
float scale = (float) 1.0 / (2 * n);
vDSP_vsmul(A.realp, 1, &scale, A.realp, 1, nOver2);
vDSP_vsmul(A.imagp, 1, &scale, A.imagp, 1, nOver2);
vDSP_ztoc(&A, 1, (COMPLEX *) obtainedReal, 2, nOver2);
vDSP_zvmags(&A, 1, obtainedReal, 1, nOver2);
Float32 one = 1;
vDSP_vdbcon(obtainedReal, 1, &one, obtainedReal, 1, nOver2, 0);
for (int i = 0; i < nOver2; i++) {
frequencyArray[i] = obtainedReal[i];
}
// Extract the maximum value
double fftMax = 0.0;
vDSP_maxmgvD((double *)obtainedReal, 1, &fftMax, nOver2);
float max = sqrt(fftMax);
}
Playing some music, I get values from -96db to 0db.
Plotting a point at:
CGPointMake(i, kMaxSpectrumHeight * (1 - frequencyArray[i]/-96.));
is giving my a rather rounded curve:
plot1
If I don't convert to db I can plot by multiplying my array value by 10000 and get nice peaks.
plot2
Am I doing something totally wrong? And how do I get to showing a variable number of bars?
Do I need to average magnitude values from a range of frequencies? Or do they just show you a magnitude for the specific frequency bar?
Yes, you definitely need to average values across the bands you've defined. Showing just one FFT bin is madness.
Also, do I need to convert my magnitude values to db?
Yes: dB is a log scale. Not coincidentally, human hearing also works (roughly) on a log scale. The values will therefore look more natural to humans if you take log2() of the values before plotting them.
How do I map my data to a certain range. Do I map against the max db range for my sounds bitdepth? Getting the max Value for a bin will
lead to jumping max mapping values.
I find the easiest thing to do (conceptually at least) is to convert your values from whatever format into a 0..1, i.e. 'normalised and scaled' float value. Then from there you can convert if necessary to something you need to plot. For example
SInt16 rawValue = fft[0]; // let's say this comes back as 12990
float scaledValue = rawValue/32767.; // This is MAX_INT for 16-bit;
// dividing we get .396435438 which is much easier for most people
// to see conceptually as 39% of our max possible value
float displayValue = log2(scaledValue);
my_fft[0] = displayValue;
I've been using this web page as a guideline for formant tracking of speech...
http://iitg.vlab.co.in/?sub=59&brch=164&sim=615&cnt=1
It all seems to be going pretty well, except for the last step, which is the converting of the cepstrum into a smoothed representation for simple peak picking for the formant tracking. The spectrograph looks good, and the cepstrograph (can I say that? :P) also looks good (from what I can tell), but the final stage the results (smoothed formant representation) are not what I expected.
I uploaded a sample of each stage as visual images to...
http://imgur.com/a/62duS
This sample is for the speech of the sound 'i' as in 'beed'. According to this site...
http://home.cc.umanitoba.ca/~robh/howto.html#formants
the first formant should come in around 500hz, and the second and third around 2200hz and 2800 hz respectively. The spetrograph shows something very similar, but on the last stage I am gettings results similar to...
F1 - 891
F2 - 1550
F3 - 2329
Any insight would be greatly appreciated. I've been going round in circles on this for some time. My code looks as follows...
// set up fft parameters
UInt32 log2n = 9;
UInt32 n = 512;
UInt32 window = n;
UInt32 halfN = n/2;
UInt32 stride = 1;
FFTSetup setupReal = [appDelegate getFftSetup];
int stepSize = (hpBuffer.sampleCount-window) / quantizeCount;
// calculate volume from raw samples, because it seems more reliable that fft
UInt32 volumeWindow = 128;
volumeBuffer = malloc(sizeof(float)*quantizeCount);
int windowPos = 0;
for (int i=0; i < quantizeCount; i++) {
windowPos += stepSize;
float total = 0.0f;
float max = 0.0f;
for (int p=windowPos; p < windowPos+volumeWindow; p++) {
total += sampleBuffer.buffer[p];
if (sampleBuffer.buffer[p] > max)
max = sampleBuffer.buffer[p];
}
volumeBuffer[i] = max;
}
// normalize volumebuffer
[FloatAudioBuffer normalizePositiveBuffer:volumeBuffer ofSize:quantizeCount];
// allocate memory for complex array
COMPLEX_SPLIT complexArray;
complexArray.realp = (float*)malloc(4096*sizeof(float));
complexArray.imagp = (float*)malloc(4096*sizeof(float));
// allocate some space for temporary hamming buffer
float *hamBuffer = malloc(n*sizeof(float));
// create spectrum and feature buffer
spectrumBuffer = malloc(sizeof(float)*halfN*quantizeCount);
formantBuffer = malloc(sizeof(float)*4096*quantizeCount);
cepstrumBuffer = malloc(sizeof(float)*halfN*quantizeCount);
lowCepstrumBuffer = malloc(sizeof(float)*featureCount*quantizeCount);
featureBuffer = malloc(sizeof(float)*featureCount*quantizeCount);
// create data point for each quantize segment
float TWOPI = 2.0f * M_PI;
for (int s=0; s < quantizeCount; s++) {
// copy buffer data into a seperate array and apply hamming window
int offset = (int)(s * stepSize);
for (int i=0; i < n; i++)
hamBuffer[i] = hpBuffer.buffer[offset+i] * ((1.0f-0.46f) - 0.46f*cos(TWOPI*i/((float)n-1.0f)));
// configure float array into acceptable input array format (interleaved)
vDSP_ctoz((COMPLEX*)hamBuffer, 2, &complexArray, 1, halfN);
// run FFT
vDSP_fft_zrip(setupReal, &complexArray, stride, log2n, FFT_FORWARD);
// Absolute square (equivalent to mag^2)
complexArray.imagp[0] = 0.0f;
vDSP_zvmags(&complexArray, 1, complexArray.realp, 1, halfN);
bzero(complexArray.imagp, (halfN) * sizeof(float));
// scale
float scale = 1.0f / (2.0f*(float)n);
vDSP_vsmul(complexArray.realp, 1, &scale, complexArray.realp, 1, halfN);
// get log of absolute values for passing to inverse FFT for cepstrum
for (int i=0; i < halfN; i++)
complexArray.realp[i] = logf(sqrtf(complexArray.realp[i]));
// save this into spectrum buffer
memcpy(&spectrumBuffer[s*halfN], complexArray.realp, halfN*sizeof(float));
// convert spectrum to interleaved ready for inverse fft
vDSP_ctoz((COMPLEX*)&spectrumBuffer[s*halfN], 2, &complexArray, 1, halfN/2);
// create cepstrum
vDSP_fft_zrip(setupReal, &complexArray, stride, log2n-1, FFT_INVERSE);
//convert interleaved to real and straight into cepstrum buffer
vDSP_ztoc(&complexArray, 1, (COMPLEX*)&cepstrumBuffer[s*halfN], 2, halfN/2);
// copy first part of cepstrum into low cepstrum buffer
memcpy(&lowCepstrumBuffer[s*featureCount], &cepstrumBuffer[s*halfN], featureCount*sizeof(float));
// make 8000 point array based on the first 15 values
float *tempArray = malloc(8192*sizeof(float));
for (int i=0; i < 8192; i++) {
if (i < 15)
tempArray[i] = cepstrumBuffer[s*halfN+i];
else
tempArray[i] = 0.0f;
}
vDSP_ctoz((COMPLEX*)tempArray, 2, &complexArray, 1, 4096);
float newLog2n = log2f(8192.0f);
complexArray.imagp[0] = 0.0f;
vDSP_fft_zrip(setupReal, &complexArray, stride, newLog2n, FFT_FORWARD);
vDSP_zvmags(&complexArray, 1, complexArray.realp, 1, 4096);
bzero(complexArray.imagp, (4096) * sizeof(float));
// scale
scale = 1.0f / (2.0f*(float)8192);
vDSP_vsmul(complexArray.realp, 1, &scale, complexArray.realp, 1, 4096);
// get magnitude
for (int i=0; i < 4096; i++)
complexArray.realp[i] = sqrtf(complexArray.realp[i]);
// write to formant buffer
memcpy(&formantBuffer[s*4096], complexArray.realp, 4096*sizeof(float));
// complex array now contains formant spectrum
// it's large, so get features here!
// try simple peak picking algorithm for first 3 formants
int formantIndex = 0;
float *peaks = malloc(6*sizeof(float));
for (int i=0; i < 6; i++)
peaks[i] = 0.0f;
for (int i=1; i < 4096-1 && formantIndex < 6; i++) {
if (complexArray.realp[i-1] < complexArray.realp[i] &&
complexArray.realp[i+1] < complexArray.realp[i])
peaks[formantIndex++] = i;
}