Disable sound through speaker while recording audio using AUAudioUnit - iOS

I'm trying to record audio using AUAudioUnit. I successfully get audio buffers, but I also hear the recorded sound through the speaker while recording. The question is: how do I get just the buffers without passing the sound to the speaker?
func startRecording() {
    setupAudioSessionForRecording()
    do {
        let audioComponentDescription = AudioComponentDescription(
            componentType: kAudioUnitType_Output,
            componentSubType: kAudioUnitSubType_RemoteIO,
            componentManufacturer: kAudioUnitManufacturer_Apple,
            componentFlags: 0,
            componentFlagsMask: 0)

        try auAudioUnit = AUAudioUnit(componentDescription: audioComponentDescription)

        let audioFormat = AVAudioFormat(commonFormat: .pcmFormatInt16,
                                        sampleRate: sampleRate,
                                        interleaved: true,
                                        channelLayout: AVAudioChannelLayout(layoutTag: kAudioChannelLayoutTag_Mono)!)

        try auAudioUnit.inputBusses[0].setFormat(audioFormat)
        try auAudioUnit.outputBusses[1].setFormat(audioFormat)
    } catch {
        print(error)
    }

    auAudioUnit.isInputEnabled = true

    auAudioUnit.outputProvider = { (actionFlags, timestamp, frameCount, inputBusNumber, inputData) -> AUAudioUnitStatus in
        let err: OSStatus = self.auAudioUnit.renderBlock(actionFlags,
                                                         timestamp,
                                                         frameCount,
                                                         1,
                                                         inputData,
                                                         .none)
        if err == noErr {
            self.processMicrophoneBuffer(inputDataList: inputData,
                                         frameCount: UInt32(frameCount))
        } else {
            print(err)
        }
        return err
    }

    do {
        try auAudioUnit.allocateRenderResources()
        try auAudioUnit.startHardware()
    } catch {
        print(error)
    }
}
SOLUTION:
The solution was found here: https://gist.github.com/leonid-s-usov/dcd674b0a8baf96123cac6c4e08e3e0c
The idea is to call the render block inside inputHandler instead of outputProvider:
// Capture the render block once; the gist refers to it as `block`.
let block = auAudioUnit.renderBlock

auAudioUnit.inputHandler = { (actionFlags, timestamp, frameCount, inputBusNumber) in
    // An AudioBufferList with a nil mData lets the audio unit supply its own buffer.
    var bufferList = AudioBufferList(mNumberBuffers: 1,
                                     mBuffers: AudioBuffer(
                                         mNumberChannels: audioFormat!.channelCount,
                                         mDataByteSize: 0,
                                         mData: nil))
    let err: OSStatus = block(actionFlags,
                              timestamp,
                              frameCount,
                              inputBusNumber,
                              &bufferList,
                              .none)
    if err == noErr {
        self.processMicrophoneBuffer(inputDataList: &bufferList,
                                     frameCount: UInt32(frameCount))
    } else {
        print(err)
    }
}

One way to silence RemoteIO output is to zero the contents (frameCount samples) of the audio buffers in your recorded input data after you process (copy) each buffer.
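A minimal sketch of that zeroing step, assuming inputData is the UnsafeMutablePointer<AudioBufferList> seen in the render callback (the buffer-walking code here is illustrative, not from the original answer):
// After copying the samples out, overwrite every buffer with zeros so the
// output side of RemoteIO has nothing but silence to play.
let buffers = UnsafeMutableAudioBufferListPointer(inputData)
for buffer in buffers {
    if let data = buffer.mData {
        memset(data, 0, Int(buffer.mDataByteSize))
    }
}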

Related

How to run wake word detection with pocket sphinx on iOS?

I'm trying to run wake word detection from PocketSphinx on iOS. As a base I used TLSphinx, and the speech-to-text works (not great STT, but it recognizes words).
I extended decoder.swift with a new function:
public func detectWakeWord(complete: @escaping (Bool?) -> ()) throws {
ps_set_keyphrase(psDecoder, "keyphrase_search", "ZWEI")
ps_set_search(psDecoder, "keyphrase_search")
do {
if #available(iOS 10.0, *) {
try AVAudioSession.sharedInstance().setCategory(.playAndRecord, mode: .voiceChat, options: [])
} else {
try AVAudioSession.sharedInstance().setCategory(.playAndRecord)
}
} catch let error as NSError {
print("Error setting the shared AVAudioSession: \(error)")
throw DecodeErrors.CantSetAudioSession(error)
}
engine = AVAudioEngine()
let input = engine.inputNode
let mixer = AVAudioMixerNode()
let output = engine.outputNode
engine.attach(mixer)
engine.connect(input, to: mixer, format: input.outputFormat(forBus: 0))
engine.connect(mixer, to: output, format: input.outputFormat(forBus: 0))
// We force-unwrap this because the docs for AVAudioFormat specify that this constructor returns nil when the number
// of channels is greater than 2.
let formatIn = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: 44100, channels: 1, interleaved: false)!
let formatOut = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false)!
guard let bufferMapper = AVAudioConverter(from: formatIn, to: formatOut) else {
// Returns nil if the format conversion is not possible.
throw DecodeErrors.CantConvertAudioFormat
}
mixer.installTap(onBus: 0, bufferSize: 2048, format: formatIn, block: {
[unowned self] (buffer: AVAudioPCMBuffer!, time: AVAudioTime!) in
guard let sphinxBuffer = AVAudioPCMBuffer(pcmFormat: formatOut, frameCapacity: buffer.frameCapacity) else {
// Returns nil in the following cases:
// - if the format has zero bytes per frame (format.streamDescription->mBytesPerFrame == 0)
// - if the buffer byte capacity (frameCapacity * format.streamDescription->mBytesPerFrame)
// cannot be represented by an uint32_t
print("Can't create PCM buffer")
return
}
// This is needed because the 'frameLength' default value is 0 (since iOS 10), which causes the 'convert' call
// to fail with an error (Error Domain=NSOSStatusErrorDomain Code=-50 "(null)")
// More here: http://stackoverflow.com/questions/39714244/avaudioconverter-is-broken-in-ios-10
sphinxBuffer.frameLength = sphinxBuffer.frameCapacity
var error : NSError?
let inputBlock : AVAudioConverterInputBlock = {
inNumPackets, outStatus in
outStatus.pointee = AVAudioConverterInputStatus.haveData
return buffer
}
bufferMapper.convert(to: sphinxBuffer, error: &error, withInputFrom: inputBlock)
print("Error? ", error as Any);
let audioData = sphinxBuffer.toData()
self.process_raw(audioData)
print("Process: \(buffer.frameLength) frames - \(audioData.count) bytes - sample time: \(time.sampleTime)")
self.end_utt()
let hypothesis = self.get_hyp()
print("HYPOTHESIS: ", hypothesis)
DispatchQueue.main.async {
complete(hypothesis != nil)
}
self.start_utt()
})
start_utt()
do {
try engine.start()
} catch let error as NSError {
end_utt()
print("Can't start AVAudioEngine: \(error)")
throw DecodeErrors.CantStartAudioEngine(error)
}
}
There are no errors, but the hypothesis is always nil.
My dictionary maps everything to "ZWEI", so the wake word should be detected if anything is detected at all.
ZWEI AH P Z EH TS B AAH EX
ZWEI(2) HH IH T
ZWEI(3) F EH EX Q OE F EH N T L IH CC T
ZWEI(4) G AX V AH EX T AX T
...
ZWEI(12113) N AY NZWO B IIH T AX N
Does anyone know why the hypothesis is always nil?
I had to call self.get_hyp() before self.end_utt().
I'm not sure why, but it is different from the calling order used for speech-to-text.
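A minimal sketch of the reordered calls inside the tap block (everything else stays as in the code above):
self.process_raw(audioData)
let hypothesis = self.get_hyp()   // query the hypothesis while the utterance is still open
self.end_utt()                    // end the utterance only afterwards
DispatchQueue.main.async {
    complete(hypothesis != nil)
}
self.start_utt()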
Edit
Another tip: for better wake word detection quality, increase the buffer size of the microphone tap, e.g.:
mixer.installTap(onBus: 0, bufferSize: 8192, format: formatIn, block: [...]

Stream audio with Swift

I'm developing an application that should record a user's voice and stream it to a custom device via the MQTT protocol.
The audio specification for the custom device: little-endian, unsigned, 16-bit LPCM at an 8 kHz sample rate. Packets should be 1000 bytes each.
I'm not familiar with AVAudioEngine, and I found this code sample which I believe fits my case:
func startRecord() {
audioEngine = AVAudioEngine()
let bus = 0
let inputNode = audioEngine.inputNode
let inputFormat = inputNode.outputFormat(forBus: bus)
var streamDescription = AudioStreamBasicDescription()
streamDescription.mFormatID = kAudioFormatLinearPCM.littleEndian
streamDescription.mSampleRate = 8000.0
streamDescription.mChannelsPerFrame = 1
streamDescription.mBitsPerChannel = 16
streamDescription.mBytesPerPacket = 1000
let outputFormat = AVAudioFormat(streamDescription: &streamDescription)!
guard let converter: AVAudioConverter = AVAudioConverter(from: inputFormat, to: outputFormat) else {
print("Can't convert in to this format")
return
}
inputNode.installTap(onBus: 0, bufferSize: 1024, format: inputFormat) { (buffer, time) in
print("Buffer format: \(buffer.format)")
var newBufferAvailable = true
let inputCallback: AVAudioConverterInputBlock = { inNumPackets, outStatus in
if newBufferAvailable {
outStatus.pointee = .haveData
newBufferAvailable = false
return buffer
} else {
outStatus.pointee = .noDataNow
return nil
}
}
let convertedBuffer = AVAudioPCMBuffer(pcmFormat: outputFormat, frameCapacity: AVAudioFrameCount(outputFormat.sampleRate) * buffer.frameLength / AVAudioFrameCount(buffer.format.sampleRate))!
var error: NSError?
let status = converter.convert(to: convertedBuffer, error: &error, withInputFrom: inputCallback)
assert(status != .error)
print("Converted buffer format:", convertedBuffer.format)
}
audioEngine.prepare()
do {
try audioEngine.start()
} catch {
print("Can't start the engine: \(error)")
}
}
But currently the converter can't convert the input format to my output format, and I don't understand why.
If I change my output format to something like this:
let outputFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 8000.0, channels: 1, interleaved: false)!
Then it works.
Your streamDescription is wrong: you hadn't filled in all the fields, and mBytesPerPacket was wrong - an ASBD "packet" is not the same kind of packet your protocol calls for. For uncompressed audio (like LPCM), AudioStreamBasicDescription requires mFramesPerPacket to be 1, so mBytesPerPacket is simply the size of one frame (2 bytes for mono 16-bit samples). If your protocol requires samples to be grouped into 1000-byte packets, you will have to do that chunking yourself after conversion (see the sketch after the code below).
Try this:
var streamDescription = AudioStreamBasicDescription()
streamDescription.mSampleRate = 8000.0
streamDescription.mFormatID = kAudioFormatLinearPCM
streamDescription.mFormatFlags = kAudioFormatFlagIsSignedInteger // no endian flag means little endian
streamDescription.mBytesPerPacket = 2
streamDescription.mFramesPerPacket = 1
streamDescription.mBytesPerFrame = 2
streamDescription.mChannelsPerFrame = 1
streamDescription.mBitsPerChannel = 16
streamDescription.mReserved = 0
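If the device really does need 1000-byte packets, here is a minimal sketch of chunking the converted Int16 data before sending (the packetize helper, packetSize, and the send closure are illustrative assumptions, not part of the original answer):
// Chunk the converted buffer's bytes into fixed-size packets for the transport layer.
func packetize(_ convertedBuffer: AVAudioPCMBuffer, packetSize: Int = 1000, send: (Data) -> Void) {
    guard let int16Data = convertedBuffer.int16ChannelData else { return }
    let byteCount = Int(convertedBuffer.frameLength) * MemoryLayout<Int16>.size
    let data = Data(bytes: int16Data[0], count: byteCount)
    var offset = 0
    while offset + packetSize <= data.count {
        send(data.subdata(in: offset..<offset + packetSize))
        offset += packetSize
    }
    // Any remainder would have to be carried over and prepended to the next buffer.
}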

Change speed and pitch of audio in real time [AudioUnit iOS]

We are working with Audio Units on iOS as part of a VoIP application. We have successfully played audio, but now we would like to control the playback speed and pitch in real time. We receive the real-time audio bytes from a UDP socket.
This is the audio unit initialization code for playback:
init(_ client: UDPClient, _ tcpClient: TCPClient, _ opusHelper: OpusHelper, _ tvTemp: UILabel) {
super.init()
let success = initCircularBuffer(&circularBuffer, 4096)
if success {
print("Circular buffer init was successful")
} else {
print("Circular buffer init not successful")
}
self.tvTemp = tvTemp
self.opusHelper = opusHelper
monotonicTimer = MonotonicTimer()
udpClient = client
self.tcpClient = tcpClient
var desc = AudioComponentDescription(
componentType: OSType(kAudioUnitType_Output),
componentSubType: OSType(kAudioUnitSubType_VoiceProcessingIO),
componentManufacturer: OSType(kAudioUnitManufacturer_Apple),
componentFlags: 0,
componentFlagsMask: 0
)
let inputComponent = AudioComponentFindNext(nil, &desc)
status = AudioComponentInstanceNew(inputComponent!, &audioUnit)
if status != noErr {
print("Audio component instance new error \(status!)")
}
var flag: UInt32 = 1
// Enable IO for playback
status = AudioUnitSetProperty(
audioUnit!,
kAudioOutputUnitProperty_EnableIO,
kAudioUnitScope_Output,
kOutputBus,
&flag,
MemoryLayoutStride.SizeOf32(flag)
)
if status != noErr {
print("Enable IO for playback error \(status!)")
}
var ioFormat = CAStreamBasicDescription(
sampleRate: 48000.0,
numChannels: 1,
pcmf: .int16,
isInterleaved: false
)
status = AudioUnitSetProperty(
audioUnit!,
AudioUnitPropertyID(kAudioUnitProperty_StreamFormat),
AudioUnitScope(kAudioUnitScope_Input),
0,
&ioFormat!,
MemoryLayoutStride.SizeOf32(ioFormat)
)
if status != noErr {
print("Unable to set stream format input to output \(status!)")
}
var playbackCallback = AURenderCallbackStruct(
inputProc: AudioController_PlaybackCallback,
inputProcRefCon: UnsafeMutableRawPointer(Unmanaged.passUnretained(self).toOpaque())
)
status = AudioUnitSetProperty(
audioUnit!,
AudioUnitPropertyID(kAudioUnitProperty_SetRenderCallback),
AudioUnitScope(kAudioUnitScope_Input),
kOutputBus,
&playbackCallback,
MemoryLayout<AURenderCallbackStruct>.size.ui
)
if status != noErr {
print("Failed to set recording render callback \(status!)")
}
status = AudioUnitInitialize(audioUnit!)
if status != noErr {
print("Failed to initialize audio unit \(status!)")
}
}
We put the audio data from UDP into a TPCircularBuffer:
let decodedData = self.opusHelper?.decodeStream(of: self.jitterGet.buffer)
let _ = TPCircularBufferProduceBytes(&self.circularBuffer, decodedData, UInt32(decodedData!.count * 2))
This is how we are playing the audio:
func performPlayback(
_ ioActionFlags: UnsafeMutablePointer<AudioUnitRenderActionFlags>,
inTimeStamp: UnsafePointer<AudioTimeStamp>,
inBufNumber: UInt32,
inNumberFrames: UInt32,
ioData: UnsafeMutablePointer<AudioBufferList>
) -> OSStatus {
let buffer = ioData[0].mBuffers
let bytesToCopy = ioData[0].mBuffers.mDataByteSize
var bufferTail: UnsafeMutableRawPointer?
// print("BYTES TO COPY: \(bytesToCopy)")
self.availableBytes = 0
bufferTail = TPCircularBufferTail(&self.circularBuffer, &self.availableBytes)
bytesToWrite = min(bytesToCopy, self.availableBytes)
print("BYTES TO WRITE: \(bytesToWrite)")
if bytesToWrite >= 3840 {
memcpy(buffer.mData, bufferTail, Int(bytesToWrite))
TPCircularBufferConsume(&self.circularBuffer, bytesToWrite)
} else {
let silence = [Int16](repeating: 0, count: Int(bytesToCopy))
memcpy(buffer.mData, silence, Int(bytesToCopy))
}
return noErr
}
Now we want to change the SPEED and PITCH of the playing audio. Can you please guide us on how to integrate Varispeed and TimePitch into our current configuration? We found that these audio units may help us.
https://stackoverflow.com/a/59061396/12020007 @hotpaw2
Your answer pointed us to the right path. Now we are looking to change speed and pitch.
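For reference, a minimal sketch of how the Varispeed/TimePitch functionality is exposed through AVAudioEngine; this assumes an AVAudioEngine graph rather than the raw VoiceProcessingIO setup above (where kAudioUnitSubType_Varispeed / kAudioUnitSubType_NewTimePitch would have to be inserted between the render callback and the output), and the rate/pitch values are illustrative. Scheduling the UDP-fed buffers onto the player node is left out:
import AVFoundation

let engine = AVAudioEngine()
let player = AVAudioPlayerNode()
let timePitch = AVAudioUnitTimePitch()   // independent rate and pitch control
timePitch.rate = 1.25                    // 1.0 is normal speed
timePitch.pitch = 300                    // in cents; 100 cents = 1 semitone

engine.attach(player)
engine.attach(timePitch)
engine.connect(player, to: timePitch, format: nil)
engine.connect(timePitch, to: engine.mainMixerNode, format: nil)
// AVAudioUnitVarispeed works the same way, but its single rate property
// changes speed and pitch together.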

Is there any way to limit AVAudioEngine's buffer size?

I can't find anywhere how to limit AVAudioEngine's or a mixer node's output buffer size. I found this on the raywenderlich.com tutorial site, but they say the buffer size is not guaranteed:
"installTap(onBus: 0, bufferSize: 1024, format: format) gives you >access to the audio data on the mainMixerNode‘s output bus. You >request a buffer size of 1024 bytes, but the requested size isn’t >guaranteed, especially if you request a buffer that’s too small or >large. Apple’s documentation doesn’t specify what those limits are."
https://www.raywenderlich.com/5154-avaudioengine-tutorial-for-ios-getting-started
I already tried installTap and a SetCurrentIOBufferFrameSize (OSStatus) method, but none of it works for limiting the buffer size.
func SetCurrentIOBufferFrameSize(inAUHAL: AudioUnit,inIOBufferFrameSize: UInt32) -> OSStatus {
var inIOBufferFrameSize = inIOBufferFrameSize
var propSize = UInt32(MemoryLayout<UInt32>.size)
return AudioUnitSetProperty(inAUHAL, AudioUnitPropertyID(kAudioUnitProperty_ScheduledFileBufferSizeFrames), kAudioUnitScope_Global, 0, &inIOBufferFrameSize, propSize)
}
func initalizeEngine() {
sampleRateConversionRatio = Float(44100 / SampleRate)
engine = AVAudioEngine()
SetCurrentIOBufferFrameSize(inAUHAL: engine.outputNode.audioUnit!, inIOBufferFrameSize: 15)
do {
try AVAudioSession.sharedInstance().setCategory(.playAndRecord , mode: .default , options: .defaultToSpeaker)
try AVAudioSession.sharedInstance().setPreferredIOBufferDuration(ioBufferDuration)
try AVAudioSession.sharedInstance().setPreferredSampleRate(Double(SampleRate))
try AVAudioSession.sharedInstance().setPreferredInputNumberOfChannels(channelCount)
} catch {
assertionFailure("AVAudioSession setup error: \(error)")
}
}
func startRecording() {
downMixer.installTap(onBus: 0, bufferSize: bufferSize, format: format) { buffer, when in
self.serialQueue.async {
let pcmBuffer = AVAudioPCMBuffer(pcmFormat: self.format16KHzMono, frameCapacity: AVAudioFrameCount(Float(buffer.frameCapacity)/self.sampleRateConversionRatio))
var error: NSError? = nil
let inputBlock: AVAudioConverterInputBlock = {inNumPackets, outStatus in
outStatus.pointee = AVAudioConverterInputStatus.haveData
return buffer
}
self.formatConverter.convert(to: pcmBuffer!, error: &error, withInputFrom: inputBlock)
if error != nil {
print(error!.localizedDescription)
}
else if let channelData = pcmBuffer!.int16ChannelData {
let channelDataPointer = channelData.pointee
let channelData = stride(from: 0,
to: Int(pcmBuffer!.frameLength),
by: buffer.stride).map{ channelDataPointer[$0] }
//Return channelDataValueArray
let data = Data(fromArray: channelData)
var byteArray = data.toByteArray()
}
}
}
}
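For reference, the ioBufferDuration used above is specified in seconds, so a target frame count has to be converted; a minimal sketch, assuming an illustrative 1024-frame target at 44.1 kHz (the hardware treats this only as a preference and may round it):
let session = AVAudioSession.sharedInstance()
let targetFrames = 1024.0                                   // illustrative target, not guaranteed
let ioBufferDuration = targetFrames / 44100.0               // seconds per hardware buffer
try? session.setPreferredIOBufferDuration(ioBufferDuration)
print("Actual IO buffer duration: \(session.ioBufferDuration)")  // what the hardware actually chose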

Connecting AVAudioMixerNode to AVAudioEngine

I use an AVAudioMixerNode to change the audio format. This entry helped me a lot. The code below gives me the data I want, but I hear my own voice on the phone's speaker. How can I prevent it?
func startAudioEngine()
{
engine = AVAudioEngine()
guard let engine = engine, let input = engine.inputNode else {
// #TODO: error out
return
}
let downMixer = AVAudioMixerNode()
//I think the engine's I/O nodes are already attached to it by default, so we attach only the downMixer here:
engine.attach(downMixer)
//You can tap the downMixer to intercept the audio and do something with it:
downMixer.installTap(onBus: 0, bufferSize: 2048, format: downMixer.outputFormat(forBus: 0), block: //originally 1024
{ (buffer: AVAudioPCMBuffer!, time: AVAudioTime!) -> Void in
//i get audio data here
}
)
//let's get the input audio format right as it is
let format = input.inputFormat(forBus: 0)
//I initialize a 16KHz format I need:
let format16KHzMono = AVAudioFormat.init(commonFormat: AVAudioCommonFormat.pcmFormatInt16, sampleRate: 11025.0, channels: 1, interleaved: true)
//connect the nodes inside the engine:
//INPUT NODE --format-> downMixer --16Kformat--> mainMixer
//as you can see I m downsampling the default 44khz we get in the input to the 16Khz I want
engine.connect(input, to: downMixer, format: format)//use default input format
engine.connect(downMixer, to: engine.outputNode, format: format16KHzMono)//use new audio format
engine.prepare()
do {
try engine.start()
} catch {
// #TODO: error out
}
}
You can hear your microphone recording through your speakers because your microphone is connected to downMixer, which is connected to engine.outputNode. You could probably just mute the output for the downMixer if you aren't using it with other inputs:
downMixer.outputVolume = 0.0
I did it like this to change the format to 48000 Hz / 16 bits per sample / 2 channels, and save it to a wave file:
let outputAudioFileFormat = [AVFormatIDKey: Int(kAudioFormatLinearPCM), AVSampleRateKey: 48000, AVNumberOfChannelsKey: 2, AVEncoderAudioQualityKey: AVAudioQuality.high.rawValue]
let audioRecordingFormat : AVAudioFormat = AVAudioFormat.init(commonFormat: AVAudioCommonFormat.pcmFormatInt16, sampleRate: 48000, channels: 2, interleaved: true)!
do{
try file = AVAudioFile(forWriting: url, settings: outputAudioFileFormat, commonFormat: .pcmFormatInt16, interleaved: true)
let recordingSession = AVAudioSession.sharedInstance()
try recordingSession.setPreferredInput(input)
try recordingSession.setPreferredSampleRate(audioRecordingFormat.sampleRate)
engine.inputNode.installTap(onBus: 0, bufferSize: 1024, format: audioRecordingFormat, block: self.bufferAvailable)
engine.connect(engine.inputNode, to: engine.outputNode, format: audioRecordingFormat) //configure graph
}
catch
{
debugPrint("Could not initialize the audio file: \(error)")
}
And the tap callback function:
func bufferAvailable(buffer: AVAudioPCMBuffer, time: AVAudioTime)
{
do
{
try self.file?.write(from: buffer)
if self.onBufferAvailable != nil {
DispatchQueue.main.async {
self.onBufferAvailable!(buffer) // outside function used for analyzing and displaying a wave meter
}
}
}
catch{
self.stopEngine()
DispatchQueue.main.async {
self.onRecordEnd(false)
}
}
}
This is the stopEngine function; you should also call it when you want to stop recording:
private func stopEngine()
{
self.engine.inputNode.removeTap(onBus: 0)
self.engine.stop()
}
