I can't find anywhere how to limit the output buffer size of AVAudioEngine or its mixer nodes. I found this on the raywenderlich.com tutorial site, but it says the buffer size is not guaranteed:
"installTap(onBus: 0, bufferSize: 1024, format: format) gives you >access to the audio data on the mainMixerNode‘s output bus. You >request a buffer size of 1024 bytes, but the requested size isn’t >guaranteed, especially if you request a buffer that’s too small or >large. Apple’s documentation doesn’t specify what those limits are."
https://www.raywenderlich.com/5154-avaudioengine-tutorial-for-ios-getting-started
I have already tried installTap and the SetCurrentIOBufferFrameSize helper below (which returns an OSStatus), but neither of them limits the buffer size.
func SetCurrentIOBufferFrameSize(inAUHAL: AudioUnit, inIOBufferFrameSize: UInt32) -> OSStatus {
    var inIOBufferFrameSize = inIOBufferFrameSize
    var propSize = UInt32(MemoryLayout<UInt32>.size)
    return AudioUnitSetProperty(inAUHAL, AudioUnitPropertyID(kAudioUnitProperty_ScheduledFileBufferSizeFrames), kAudioUnitScope_Global, 0, &inIOBufferFrameSize, propSize)
}
func initalizeEngine() {
    sampleRateConversionRatio = Float(44100 / SampleRate)
    engine = AVAudioEngine()
    SetCurrentIOBufferFrameSize(inAUHAL: engine.outputNode.audioUnit!, inIOBufferFrameSize: 15)
    do {
        try AVAudioSession.sharedInstance().setCategory(.playAndRecord, mode: .default, options: .defaultToSpeaker)
        try AVAudioSession.sharedInstance().setPreferredIOBufferDuration(ioBufferDuration)
        try AVAudioSession.sharedInstance().setPreferredSampleRate(Double(SampleRate))
        try AVAudioSession.sharedInstance().setPreferredInputNumberOfChannels(channelCount)
    } catch {
        assertionFailure("AVAudioSession setup error: \(error)")
    }
}
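Note that the setPreferred… calls above are only requests; the system decides what it actually grants. A quick check (not part of the original code) is to read the session's actual values back after activating it; the effective IO buffer is roughly sampleRate × ioBufferDuration frames, and the tap buffer size is managed separately and may not match it:
do {
    try AVAudioSession.sharedInstance().setActive(true)
} catch {
    print("Could not activate session: \(error)")
}
// These are the values the system actually granted, which may differ
// from the preferred values requested above.
print("Actual sample rate: \(AVAudioSession.sharedInstance().sampleRate)")
print("Actual IO buffer duration: \(AVAudioSession.sharedInstance().ioBufferDuration)")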
func startRecording() {
    downMixer.installTap(onBus: 0, bufferSize: bufferSize, format: format) { buffer, when in
        self.serialQueue.async {
            let pcmBuffer = AVAudioPCMBuffer(pcmFormat: self.format16KHzMono, frameCapacity: AVAudioFrameCount(Float(buffer.frameCapacity) / self.sampleRateConversionRatio))
            var error: NSError? = nil
            let inputBlock: AVAudioConverterInputBlock = { inNumPackets, outStatus in
                outStatus.pointee = AVAudioConverterInputStatus.haveData
                return buffer
            }
            self.formatConverter.convert(to: pcmBuffer!, error: &error, withInputFrom: inputBlock)
            if error != nil {
                print(error!.localizedDescription)
            } else if let channelData = pcmBuffer!.int16ChannelData {
                let channelDataPointer = channelData.pointee
                let channelData = stride(from: 0,
                                         to: Int(pcmBuffer!.frameLength),
                                         by: buffer.stride).map { channelDataPointer[$0] }
                // Return channelDataValueArray
                let data = Data(fromArray: channelData)
                var byteArray = data.toByteArray()
            }
        }
    }
}
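Since the tap's bufferSize argument is only a hint, a common workaround (not part of the original code) is to re-chunk whatever the tap delivers into fixed-size buffers yourself before converting. A minimal sketch, assuming a non-interleaved Float32 tap format and a 1024-frame target; chunkFrames and chunkBuffer are names introduced here for illustration:
let chunkFrames: AVAudioFrameCount = 1024
guard let chunkBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: chunkFrames) else { return }

downMixer.installTap(onBus: 0, bufferSize: chunkFrames, format: format) { buffer, _ in
    var framesRead: AVAudioFrameCount = 0
    while framesRead < buffer.frameLength {
        let framesToCopy = min(chunkFrames - chunkBuffer.frameLength, buffer.frameLength - framesRead)
        for channel in 0..<Int(format.channelCount) {
            // Assumes Float32 samples; use int16ChannelData instead for Int16 formats.
            memcpy(chunkBuffer.floatChannelData![channel] + Int(chunkBuffer.frameLength),
                   buffer.floatChannelData![channel] + Int(framesRead),
                   Int(framesToCopy) * MemoryLayout<Float>.size)
        }
        chunkBuffer.frameLength += framesToCopy
        framesRead += framesToCopy
        if chunkBuffer.frameLength == chunkFrames {
            // A full 1024-frame chunk is ready: convert / process it here.
            chunkBuffer.frameLength = 0
        }
    }
}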
Related
I am currently using Microsoft Azure Cognitive Speech SDK to play text to speech.
I am able to get the data from the Stream which is provided in the following format (reference):
.audio16Khz32KBitRateMonoMp3
This is set like this:
private let inputFormat = AVAudioFormat(
    commonFormat: .pcmFormatFloat32,
    sampleRate: 16000,
    channels: 1,
    interleaved: false
)!
I'm using AVAudioEngine & AVAudioPlayerNode:
let engine = AVAudioEngine()
let player = AVAudioPlayerNode()
override func viewDidLoad() {
    super.viewDidLoad()
    let mainMixer = engine.mainMixerNode
    engine.attach(player)
    engine.connect(player, to: mainMixer, format: inputFormat)
    try! engine.start()
}
I am able to play this back with some success using the following:
func playAudio(dialogue: String, audioPlayer: AVAudioPlayerNode, then completion: @escaping () -> Void) {
    audioAsset = nil
    try? FileManager.default.removeItem(at: recordingPath)
    FileManager.default.createFile(atPath: recordingPath.path, contents: nil, attributes: nil)
    do {
        let configuration = try SPXSpeechConfiguration(subscription: Microsoft.key, region: Microsoft.region)
        configuration.setSpeechSynthesisOutputFormat(.audio16Khz32KBitRateMonoMp3)
        let synthesizer = try SPXSpeechSynthesizer(speechConfiguration: configuration, audioConfiguration: nil)
        let speechResult = try synthesizer.startSpeakingSsml(dialogue)
        let stream = try SPXAudioDataStream(from: speechResult)
        guard
            let mutableFile = FileHandle(forWritingAtPath: recordingPath.path),
            let streamData = NSMutableData(capacity: Int(bufferCapacity))
        else {
            fatalError()
        }
        while stream.read(streamData, length: bufferCapacity) > 0 {
            mutableFile.write(streamData as Data)
            mutableFile.seekToEndOfFile()
            do {
                let buffer = try readFileIntoBuffer(audioUrl: recordingPath)
                audioPlayer.scheduleBuffer(buffer, at: currentBufferTime(buffer: buffer)) { [weak self] in
                    guard let self = self else { return }
                    if let audioAsset = self.audioAsset, audioPlayer.currentTime >= CMTimeGetSeconds(audioAsset.duration) {
                        DispatchQueue.main.async {
                            audioPlayer.stop()
                        }
                        completion()
                    }
                }
                audioPlayer.play()
            } catch {
                print("Unable To Play Azure Buffer Stream \(error)")
            }
        }
        print("Did Complete Azure Buffer Rendering To File")
        audioAsset = AVURLAsset.init(url: recordingPath, options: nil)
        mutableFile.closeFile()
    } catch {
        print("Unable To Run Azure Vocoder \(error)")
    }
}
My buffer-creation functions are as follows:
func currentBufferTime(buffer: AVAudioPCMBuffer) -> AVAudioTime {
    let framecount = Double(buffer.frameLength)
    let samplerate = buffer.format.sampleRate
    let position = TimeInterval(framecount / samplerate)
    return AVAudioTime(sampleTime: AVAudioFramePosition(position), atRate: 1)
}

func readFileIntoBuffer(audioUrl: URL) throws -> AVAudioPCMBuffer {
    let audioFile = try AVAudioFile(forReading: audioUrl)
    let audioFileFormat = audioFile.processingFormat
    let audioFileSize = UInt32(audioFile.length)
    let audioBuffer = AVAudioPCMBuffer(pcmFormat: audioFileFormat, frameCapacity: audioFileSize)!
    try audioFile.read(into: audioBuffer)
    return audioBuffer
}
The issue is that this is not performant: the CPU sits at around 100% for a significant amount of time while the function runs.
So my question is: what is a more efficient way of reading the data into a PCM buffer?
I have looked at many examples and nothing seems to work. For example:
func toPCMBuffer(format: AVAudioFormat, data: NSData) -> AVAudioPCMBuffer? {
    let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: UInt32(data.count) / format.streamDescription.pointee.mBytesPerFrame)
    guard let buffer = buffer else { return nil }
    buffer.frameLength = buffer.frameCapacity
    let channels = UnsafeBufferPointer(start: buffer.int32ChannelData, count: Int(buffer.format.channelCount))
    data.getBytes(UnsafeMutableRawPointer(channels[0]), length: data.count)
    return buffer
}
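As an aside, the example above copies into int32ChannelData even though the format is .pcmFormatFloat32, so it targets the wrong channel pointer. A hedged sketch of the same helper for non-interleaved Float32 mono data; note it only applies once the bytes really are raw PCM (the Azure stream above is MP3, which would need decoding first, e.g. via AVAudioFile or AVAudioConverter):
func toPCMBuffer(format: AVAudioFormat, data: Data) -> AVAudioPCMBuffer? {
    let bytesPerFrame = Int(format.streamDescription.pointee.mBytesPerFrame)
    let frameCount = AVAudioFrameCount(data.count / bytesPerFrame)
    guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount),
          let channelData = buffer.floatChannelData else { return nil }
    buffer.frameLength = frameCount
    // Copy the raw Float32 samples into channel 0 (mono, non-interleaved).
    data.withUnsafeBytes { (rawBuffer: UnsafeRawBufferPointer) in
        if let src = rawBuffer.baseAddress {
            memcpy(channelData[0], src, Int(frameCount) * MemoryLayout<Float>.size)
        }
    }
    return buffer
}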
I'm trying to use AVAudioEngine instead of AVAudioPlayer because I need to do some per-packet processing as the audio is playing, but before I can get that far, I need to convert the 16-bit 8 kHz mono audio data to stereo so that AVAudioEngine will play it. This is my (incomplete) attempt to do it. I'm currently stuck on how to make AVAudioConverter do the mono-to-stereo conversion. If I don't use the AVAudioConverter, the iOS runtime complains that the input format doesn't match the output format. If I do use it (as below), the runtime doesn't complain, but the audio does not play back properly (likely because I'm not doing the mono-to-stereo conversion correctly). Any assistance is appreciated!
private func loadAudioData(audioData: Data?) {
    // Load audio data into player
    guard let audio = audioData else { return }
    do {
        let inputAudioFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: Double(sampleRate), channels: 1, interleaved: false)
        let outputAudioFormat = self.audioEngine.mainMixerNode.outputFormat(forBus: 0)
        if inputAudioFormat != nil {
            let inputStreamDescription = inputAudioFormat?.streamDescription.pointee
            let outputStreamDescription = outputAudioFormat.streamDescription.pointee
            let count = UInt32(audio.count)
            if inputStreamDescription != nil && count > 0 {
                if let ibpf = inputStreamDescription?.mBytesPerFrame {
                    let inputFrameCapacity = count / ibpf
                    let outputFrameCapacity = count / outputStreamDescription.mBytesPerFrame
                    self.pcmInputBuffer = AVAudioPCMBuffer(pcmFormat: inputAudioFormat!, frameCapacity: inputFrameCapacity)
                    self.pcmOutputBuffer = AVAudioPCMBuffer(pcmFormat: outputAudioFormat, frameCapacity: outputFrameCapacity)
                    if let input = self.pcmInputBuffer, let output = self.pcmOutputBuffer {
                        self.pcmConverter = AVAudioConverter(from: inputAudioFormat!, to: outputAudioFormat)
                        input.frameLength = input.frameCapacity
                        let b = UnsafeMutableBufferPointer(start: input.int16ChannelData?[0], count: input.stride * Int(inputFrameCapacity))
                        let bytesCopied = audio.copyBytes(to: b)
                        assert(bytesCopied == count)
                        audioEngine.attach(playerNode)
                        audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: nil)
                        self.pcmConverter?.convert(to: output, error: nil) { packets, status in
                            status.pointee = .haveData
                            return self.pcmInputBuffer // I know this is wrong, but i'm not sure how to do it correctly
                        }
                        try audioEngine.start()
                    }
                }
            }
        }
    }
}
Speculative, incorrect answer
How about pcmConverter?.channelMap = [0, 0]?
Actual answer
You don't need to use the audio converter's channel map, because mono-to-stereo AVAudioConverters seem to duplicate the mono channel by default. The main problems were that outputFrameCapacity was wrong, and that you used mainMixerNode's outputFormat before calling audioEngine.prepare() or starting the engine.
Assuming sampleRate = 8000, an amended solution looks like this:
private func loadAudioData(audioData: Data?) throws {
    // Load audio data into player
    guard let audio = audioData else { return }
    do {
        audioEngine.attach(playerNode)
        audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: nil)
        audioEngine.prepare() // https://stackoverflow.com/a/70392017/22147

        let outputAudioFormat = self.audioEngine.mainMixerNode.outputFormat(forBus: 0)
        guard let inputAudioFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: Double(sampleRate), channels: 1, interleaved: false) else { return }

        let inputStreamDescription = inputAudioFormat.streamDescription.pointee
        let outputStreamDescription = outputAudioFormat.streamDescription.pointee
        let count = UInt32(audio.count)
        if count > 0 {
            let ibpf = inputStreamDescription.mBytesPerFrame
            let inputFrameCapacity = count / ibpf
            let outputFrameCapacity = Float64(inputFrameCapacity) * outputStreamDescription.mSampleRate / inputStreamDescription.mSampleRate
            self.pcmInputBuffer = AVAudioPCMBuffer(pcmFormat: inputAudioFormat, frameCapacity: inputFrameCapacity)
            self.pcmOutputBuffer = AVAudioPCMBuffer(pcmFormat: outputAudioFormat, frameCapacity: AVAudioFrameCount(outputFrameCapacity))
            if let input = self.pcmInputBuffer, let output = self.pcmOutputBuffer {
                self.pcmConverter = AVAudioConverter(from: inputAudioFormat, to: outputAudioFormat)
                input.frameLength = input.frameCapacity
                let b = UnsafeMutableBufferPointer(start: input.int16ChannelData?[0], count: input.stride * Int(inputFrameCapacity))
                let bytesCopied = audio.copyBytes(to: b)
                assert(bytesCopied == count)
                self.pcmConverter?.convert(to: output, error: nil) { packets, status in
                    status.pointee = .haveData
                    return self.pcmInputBuffer // I know this is wrong, but i'm not sure how to do it correctly
                }
                try audioEngine.start()
                self.playerNode.scheduleBuffer(output, completionHandler: nil)
                self.playerNode.play()
            }
        }
    }
}
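One detail carried over from the question: the input block always reports .haveData and keeps returning the same buffer. For a one-shot conversion it is more robust to hand the buffer over once and then report .endOfStream so the converter knows the input is exhausted. A minimal sketch using the same converter and buffers as above, with a captured flag introduced here for illustration:
var consumed = false
self.pcmConverter?.convert(to: output, error: nil) { _, status in
    if consumed {
        status.pointee = .endOfStream
        return nil
    }
    consumed = true
    status.pointee = .haveData
    return self.pcmInputBuffer
}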
I'm trying to run the wake word detection from PocketSphinx on iOS. As a base I used TLSphinx, and the speech-to-text works (not great STT, but it recognizes words).
I extended decoder.swift with a new function:
public func detectWakeWord(complete: @escaping (Bool?) -> ()) throws {
    ps_set_keyphrase(psDecoder, "keyphrase_search", "ZWEI")
    ps_set_search(psDecoder, "keyphrase_search")

    do {
        if #available(iOS 10.0, *) {
            try AVAudioSession.sharedInstance().setCategory(.playAndRecord, mode: .voiceChat, options: [])
        } else {
            try AVAudioSession.sharedInstance().setCategory(.playAndRecord)
        }
    } catch let error as NSError {
        print("Error setting the shared AVAudioSession: \(error)")
        throw DecodeErrors.CantSetAudioSession(error)
    }

    engine = AVAudioEngine()
    let input = engine.inputNode
    let mixer = AVAudioMixerNode()
    let output = engine.outputNode
    engine.attach(mixer)
    engine.connect(input, to: mixer, format: input.outputFormat(forBus: 0))
    engine.connect(mixer, to: output, format: input.outputFormat(forBus: 0))

    // We force-unwrap this because the docs for AVAudioFormat specify that this initializer returns nil
    // only when the channel count is greater than 2.
    let formatIn = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: 44100, channels: 1, interleaved: false)!
    let formatOut = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false)!

    guard let bufferMapper = AVAudioConverter(from: formatIn, to: formatOut) else {
        // Returns nil if the format conversion is not possible.
        throw DecodeErrors.CantConvertAudioFormat
    }

    mixer.installTap(onBus: 0, bufferSize: 2048, format: formatIn, block: {
        [unowned self] (buffer: AVAudioPCMBuffer!, time: AVAudioTime!) in

        guard let sphinxBuffer = AVAudioPCMBuffer(pcmFormat: formatOut, frameCapacity: buffer.frameCapacity) else {
            // Returns nil in the following cases:
            // - if the format has zero bytes per frame (format.streamDescription->mBytesPerFrame == 0)
            // - if the buffer byte capacity (frameCapacity * format.streamDescription->mBytesPerFrame)
            //   cannot be represented by an uint32_t
            print("Can't create PCM buffer")
            return
        }

        // This is needed because the 'frameLength' default value is 0 (since iOS 10) and causes the 'convert'
        // call to fail with an error (Error Domain=NSOSStatusErrorDomain Code=-50 "(null)")
        // More here: http://stackoverflow.com/questions/39714244/avaudioconverter-is-broken-in-ios-10
        sphinxBuffer.frameLength = sphinxBuffer.frameCapacity

        var error: NSError?
        let inputBlock: AVAudioConverterInputBlock = { inNumPackets, outStatus in
            outStatus.pointee = AVAudioConverterInputStatus.haveData
            return buffer
        }
        bufferMapper.convert(to: sphinxBuffer, error: &error, withInputFrom: inputBlock)
        print("Error? ", error as Any)

        let audioData = sphinxBuffer.toData()
        self.process_raw(audioData)

        print("Process: \(buffer.frameLength) frames - \(audioData.count) bytes - sample time: \(time.sampleTime)")

        self.end_utt()
        let hypothesis = self.get_hyp()
        print("HYPOTHESIS: ", hypothesis)

        DispatchQueue.main.async {
            complete(hypothesis != nil)
        }

        self.start_utt()
    })

    start_utt()

    do {
        try engine.start()
    } catch let error as NSError {
        end_utt()
        print("Can't start AVAudioEngine: \(error)")
        throw DecodeErrors.CantStartAudioEngine(error)
    }
}
There are no errors, but the hypothesis is always nil.
My dictionary maps everything to "ZWEI", so the wake word should be detected if anything is detected at all.
ZWEI AH P Z EH TS B AAH EX
ZWEI(2) HH IH T
ZWEI(3) F EH EX Q OE F EH N T L IH CC T
ZWEI(4) G AX V AH EX T AX T
...
ZWEI(12113) N AY NZWO B IIH T AX N
Does anyone know why the hypothesis is always nil?
I had to run self.get_hyp() before self.end_utt().
I'm not sure why, but it is a different calling order from the speech-to-text path.
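For clarity, the reordered part of the tap block looks roughly like this (same TLSphinx wrapper calls as in the question, just with get_hyp moved ahead of end_utt):
self.process_raw(audioData)
let hypothesis = self.get_hyp()   // read the hypothesis while the utterance is still open
self.end_utt()

DispatchQueue.main.async {
    complete(hypothesis != nil)
}

self.start_utt()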
Edit
Another tip: for better wake-word detection quality, increase the buffer size for the microphone tap, e.g.:
mixer.installTap(onBus: 0, bufferSize: 8192, format: formatIn, block: [...]
I'm developing an application that should record a user's voice and stream it to a custom device via the MQTT protocol.
The audio specification for the custom device: little-endian, unsigned, 16-bit LPCM at an 8 kHz sample rate. Packets should be 1000 bytes each.
I'm not familiar with AVAudioEngine, and I found this code sample, which I believe fits my case:
func startRecord() {
    audioEngine = AVAudioEngine()
    let bus = 0
    let inputNode = audioEngine.inputNode
    let inputFormat = inputNode.outputFormat(forBus: bus)

    var streamDescription = AudioStreamBasicDescription()
    streamDescription.mFormatID = kAudioFormatLinearPCM.littleEndian
    streamDescription.mSampleRate = 8000.0
    streamDescription.mChannelsPerFrame = 1
    streamDescription.mBitsPerChannel = 16
    streamDescription.mBytesPerPacket = 1000

    let outputFormat = AVAudioFormat(streamDescription: &streamDescription)!

    guard let converter: AVAudioConverter = AVAudioConverter(from: inputFormat, to: outputFormat) else {
        print("Can't convert in to this format")
        return
    }

    inputNode.installTap(onBus: 0, bufferSize: 1024, format: inputFormat) { (buffer, time) in
        print("Buffer format: \(buffer.format)")

        var newBufferAvailable = true
        let inputCallback: AVAudioConverterInputBlock = { inNumPackets, outStatus in
            if newBufferAvailable {
                outStatus.pointee = .haveData
                newBufferAvailable = false
                return buffer
            } else {
                outStatus.pointee = .noDataNow
                return nil
            }
        }

        let convertedBuffer = AVAudioPCMBuffer(pcmFormat: outputFormat, frameCapacity: AVAudioFrameCount(outputFormat.sampleRate) * buffer.frameLength / AVAudioFrameCount(buffer.format.sampleRate))!

        var error: NSError?
        let status = converter.convert(to: convertedBuffer, error: &error, withInputFrom: inputCallback)
        assert(status != .error)
        print("Converted buffer format:", convertedBuffer.format)
    }

    audioEngine.prepare()
    do {
        try audioEngine.start()
    } catch {
        print("Can't start the engine: \(error)")
    }
}
But currently the converter can't convert the input format to my output format, and I don't understand why.
If I change my output format to something like this:
let outputFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 8000.0, channels: 1, interleaved: false)!
Then it works.
Your streamDescription is wrong: you hadn't filled in all the fields, and mBytesPerPacket was wrong. An ASBD "packet" is not the same kind of packet your protocol calls for. For uncompressed audio such as LPCM, mFramesPerPacket must be 1, so mBytesPerPacket is simply the bytes in one frame (2 bytes for 16-bit mono). If your protocol requires samples to be sent in groups of 1000 bytes, you will have to do that grouping yourself.
Try this
var streamDescription = AudioStreamBasicDescription()
streamDescription.mSampleRate = 8000.0
streamDescription.mFormatID = kAudioFormatLinearPCM
streamDescription.mFormatFlags = kAudioFormatFlagIsSignedInteger // no endian flag means little endian
streamDescription.mBytesPerPacket = 2
streamDescription.mFramesPerPacket = 1
streamDescription.mBytesPerFrame = 2
streamDescription.mChannelsPerFrame = 1
streamDescription.mBitsPerChannel = 16
streamDescription.mReserved = 0
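With that description you can build the output AVAudioFormat as before, and then group the converted Int16 bytes into 1000-byte payloads yourself, since an ASBD packet is not the packet your protocol means. A rough sketch (the packetize helper is hypothetical, not part of the answer):
let outputFormat = AVAudioFormat(streamDescription: &streamDescription)!

// Hypothetical helper: slice a converted buffer into 1000-byte payloads for MQTT.
func packetize(_ buffer: AVAudioPCMBuffer, packetSize: Int = 1000) -> [Data] {
    let bytesPerFrame = Int(buffer.format.streamDescription.pointee.mBytesPerFrame)
    let byteCount = Int(buffer.frameLength) * bytesPerFrame
    let data = Data(bytes: buffer.int16ChannelData![0], count: byteCount)
    return stride(from: 0, to: data.count, by: packetSize).map {
        data.subdata(in: $0..<min($0 + packetSize, data.count))
    }
}
The last slice may be shorter than 1000 bytes, so you would hold it back and prepend it to the next converted chunk (or pad it), depending on what the device expects.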
I'm writing a first-in-first-out recording app that buffers up to 2.5 minutes of audio using AudioQueue. I've got most of it figured out, but I'm at a roadblock trying to crop the audio data.
I've seen people do it with AVAssetExportSession, but it seems like it wouldn't be performant to export a new track every time the AudioQueueInputCallback is called.
I'm not married to using AVAssetExportSession by any means if anyone has a better idea.
Here's where I'm doing my write and where I was hoping to execute the crop:
var beforeSeconds = TimeInterval()  // find the current estimated duration (not reliable)
var propertySize = UInt32(MemoryLayout.size(ofValue: beforeSeconds))
var osStatus = AudioFileGetProperty(audioRecorder.recordFile!, kAudioFilePropertyEstimatedDuration, &propertySize, &beforeSeconds)

if numPackets > 0 {
    AudioFileWritePackets(audioRecorder.recordFile!,  // write to disk
                          false,
                          buffer.mAudioDataByteSize,
                          packetDescriptions,
                          audioRecorder.recordPacket,
                          &numPackets,
                          buffer.mAudioData)
    audioRecorder.recordPacket += Int64(numPackets)  // up the packet index

    var afterSeconds = TimeInterval()  // find the after-write estimated duration (not reliable)
    var propertySize = UInt32(MemoryLayout.size(ofValue: afterSeconds))
    var osStatus = AudioFileGetProperty(audioRecorder.recordFile!, kAudioFilePropertyEstimatedDuration, &propertySize, &afterSeconds)
    assert(osStatus == noErr, "couldn't get record time")

    if afterSeconds >= 150.0 {
        print("hit max buffer!")
        audioRecorder.onBufferMax?(afterSeconds - beforeSeconds)
    }
}
Here's where the callback is executed
func onBufferMax(_ difference: Double) {
    let asset = AVAsset(url: tempFilePath)
    let duration = CMTimeGetSeconds(asset.duration)
    guard duration >= 150.0 else { return }

    guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetAppleM4A) else {
        print("exporter init failed")
        return
    }
    exporter.outputURL = getDocumentsDirectory().appendingPathComponent("buffered.caf")  // helper function that calls the FileManager
    exporter.outputFileType = AVFileTypeAppleM4A

    let startTime = CMTimeMake(Int64(difference), 1)
    let endTime = CMTimeMake(Int64(WYNDRConstants.maxTimeInterval + difference), 1)
    exporter.timeRange = CMTimeRangeFromTimeToTime(startTime, endTime)

    exporter.exportAsynchronously(completionHandler: {
        switch exporter.status {
        case .failed:
            print("failed to export")
        case .cancelled:
            print("canceled export")
        default:
            print("export successful")
        }
    })
}
A ring buffer is a useful structure for storing, either in memory or on disk, the most recent n seconds of audio. Here is a simple solution that stores the audio in memory, presented in the traditional UIViewController format.
N.B. 2.5 minutes of 44.1 kHz audio stored as floats requires about 26 MB of RAM, which is on the heavy side for a mobile device.
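(That figure is just 44,100 samples/s × 150 s × 4 bytes per Float32 sample ≈ 26.5 MB per channel.)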
import AVFoundation

class ViewController: UIViewController {
    let engine = AVAudioEngine()

    var requiredSamples: AVAudioFrameCount = 0
    var ringBuffer: [AVAudioPCMBuffer] = []
    var ringBufferSizeInSamples: AVAudioFrameCount = 0

    func startRecording() {
        let input = engine.inputNode
        let bus = 0
        let inputFormat = input.inputFormat(forBus: bus)

        requiredSamples = AVAudioFrameCount(inputFormat.sampleRate * 2.5 * 60)

        input.installTap(onBus: bus, bufferSize: 512, format: inputFormat) { (buffer, time) -> Void in
            self.appendAudioBuffer(buffer)
        }

        try! engine.start()
    }

    func appendAudioBuffer(_ buffer: AVAudioPCMBuffer) {
        ringBuffer.append(buffer)
        ringBufferSizeInSamples += buffer.frameLength

        // throw away old buffers if ring buffer gets too large
        if let firstBuffer = ringBuffer.first {
            if ringBufferSizeInSamples - firstBuffer.frameLength >= requiredSamples {
                ringBuffer.remove(at: 0)
                ringBufferSizeInSamples -= firstBuffer.frameLength
            }
        }
    }

    func stopRecording() {
        engine.stop()

        let url = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first!.appendingPathComponent("foo.m4a")
        let settings: [String: Any] = [AVFormatIDKey: Int(kAudioFormatMPEG4AAC)]

        // write ring buffer to file.
        let file = try! AVAudioFile(forWriting: url, settings: settings)
        for buffer in ringBuffer {
            try! file.write(from: buffer)
        }
    }

    override func viewDidLoad() {
        super.viewDidLoad()

        // example usage
        startRecording()
        DispatchQueue.main.asyncAfter(deadline: .now() + 4*60) {
            print("stopping")
            self.stopRecording()
        }
    }
}