Can we use SFSpeechRecognizer with call Kit - ios

We are creating an online book reading app in which we are initiating video call (group call:- for video call. we are using agora SDK) and at the join of call we start book reading and highlight words at other members' end also and recording/recognition text we are using SFSpeechRecognizer but whenever call kit start and video call start SFSpeechRecognizer start to record audio at others end it's getting failed always, can you please provide any solution to record audio during the video call.
//
// Speech.swift
// Edsoma
//
// Created by Kapil on 16/02/22.
//
import Foundation
import AVFoundation
import Speech
protocol SpeechRecognizerDelegate {
func didSpoke(speechRecognizer : SpeechRecognizer , word : String?)
}
class SpeechRecognizer: NSObject {
private let speechRecognizer = SFSpeechRecognizer(locale: Locale.init(identifier: "en-US")) //1
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private let audioEngine = AVAudioEngine()
var delegate : SpeechRecognizerDelegate?
static let shared = SpeechRecognizer()
var isOn = false
func setup(){
speechRecognizer?.delegate = self //3
SFSpeechRecognizer.requestAuthorization { (authStatus) in //4
var isButtonEnabled = false
switch authStatus { //5
case .authorized:
isButtonEnabled = true
case .denied:
isButtonEnabled = false
print("User denied access to speech recognition")
case .restricted:
isButtonEnabled = false
print("Speech recognition restricted on this device" )
case .notDetermined:
isButtonEnabled = false
print("Speech recognition not yet authorized")
#unknown default:
break;
}
OperationQueue.main.addOperation() {
// self.microphoneButton.isEnabled = isButtonEnabled
}
}
}
func transcribeAudio(url: URL) {
// create a new recognizer and point it at our audio
let recognizer = SFSpeechRecognizer()
let request = SFSpeechURLRecognitionRequest(url: url)
// start recognition!
recognizer?.recognitionTask(with: request) { [unowned self] (result, error) in
// abort if we didn't get any transcription back
guard let result = result else {
print("There was an error: \(error!)")
return
}
// if we got the final transcription back, print it
if result.isFinal {
// pull out the best transcription...
print(result.bestTranscription.formattedString)
}
}
}
func startRecording() {
isOn = true
let inputNode = audioEngine.inputNode
if recognitionTask != nil {
inputNode.removeTap(onBus: 0)
self.audioEngine.stop()
self.recognitionRequest = nil
self.recognitionTask = nil
DispatchQueue.main.asyncAfter(deadline: DispatchTime.now() + 1) {
self.startRecording()
}
return
debugPrint("****** recognitionTask != nil *************")
}
let audioSession = AVAudioSession.sharedInstance()
do {
try audioSession.setCategory(AVAudioSession.Category.multiRoute)
try audioSession.setMode(AVAudioSession.Mode.measurement)
try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
} catch {
print("audioSession properties weren't set because of an error.")
}
recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
guard let recognitionRequest = recognitionRequest else {
fatalError("Unable to create an SFSpeechAudioBufferRecognitionRequest object")
}
recognitionRequest.shouldReportPartialResults = true
recognitionRequest.taskHint = .search
recognitionTask = speechRecognizer?.recognitionTask(with: recognitionRequest, resultHandler: { (result, error) in
var isFinal = false
if result != nil {
self.delegate?.didSpoke(speechRecognizer: self, word: result?.bestTranscription.formattedString)
debugPrint(result?.bestTranscription.formattedString)
isFinal = (result?.isFinal)!
}
if error != nil {
debugPrint("Speech Error ====>",error)
inputNode.removeTap(onBus: 0)
self.audioEngine.stop()
self.recognitionRequest = nil
self.recognitionTask = nil
if BookReadingSettings.isSTTEnable{
DispatchQueue.main.asyncAfter(deadline: DispatchTime.now() + 1) {
self.startRecording()
}
}
// self.microphoneButton.isEnabled = true
}
})
// let recordingFormat = AVAudioFormat.init(commonFormat: .pcmFormatFloat32, sampleRate: <#T##Double#>, interleaved: <#T##Bool#>, channelLayout: <#T##AVAudioChannelLayout#>)//inputNode.outputFormat(forBus: 0)
inputNode.removeTap(onBus: 0)
let sampleRate = AVAudioSession.sharedInstance().sampleRate
let recordingFormat = AVAudioFormat(standardFormatWithSampleRate: sampleRate, channels: 1)
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer, when) in
self.recognitionRequest?.append(buffer)
}
audioEngine.prepare()
do {
try audioEngine.start()
} catch {
print("audioEngine couldn't start because of an error.")
}
debugPrint("Say something, I'm listening!")
//textView.text = "Say something, I'm listening!"
}
/* func stopRecording(){
isOn = false
debugPrint("Recording stoped")
self.audioEngine.stop()
recognitionTask?.cancel()
let inputNode = audioEngine.inputNode
inputNode.removeTap(onBus: 0)
self.recognitionRequest = nil
self.recognitionTask = nil
}*/
func stopRecording(){
isOn = false
debugPrint("Recording stoped")
let inputNode = audioEngine.inputNode
inputNode.removeTap(onBus: 0)
self.audioEngine.stop()
recognitionTask?.cancel()
self.recognitionRequest = nil
self.recognitionTask = nil
}
}

Related

After twaillio call required condition is false: format.sampleRate == hwFormat.sampleRate'

This is working fine until I make a call using a twillo framework and again start listening. it is crashing with the following error
required condition is false: format.sampleRate == hwFormat.sampleRate'
it is crashing on this line
inputNode?.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) {[weak self] (buffer, when) in
self?.recognitionRequest?.append(buffer)
}
Here is the full code
class MySppechRecognizer: NSObject,SFSpeechRecognizerDelegate {
/// initalize the speech recognitior . it is a shared instace.
static let speechSharedInstance = MySppechRecognizer()
var isSppechRecognisationAvaible = true
var speechRecognizer:SFSpeechRecognizer? = nil
var audioSession = AVAudioSession.sharedInstance()
var audioEngine = AVAudioEngine()
var recognitionTask: SFSpeechRecognitionTask?
var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
var isFinalWord = false
var inputNode:AVAudioInputNode? = nil
var callBack:jimboSpeechCallBack? = nil
var isHotWordDetectedForApp = false
func setSpeechRec() {
if speechRecognizer == nil {
speechRecognizer = SFSpeechRecognizer(locale: kAppLocal)
speechRecognizer?.delegate = self
}
}
}
//MARK:- Delegate
func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
print("Availibility changes")
}
//MARK:- Audio engine
func startRecording(){
if recognitionTask != nil {
self.recognitionRequest?.endAudio()
recognitionTask?.cancel()
recognitionTask = nil
recognitionRequest = nil
inputNode?.reset()
inputNode?.removeTap(onBus: 0)
inputNode?.reset()
stopRecording()
}
do {
try audioSession.setCategory(AVAudioSession.Category.playAndRecord, mode: .measurement)
try audioSession.setMode(.measurement)
try audioSession.setPreferredSampleRate(44100)
try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
} catch {
print("audioSession properties weren't set because of an error.")
}
inputNode = audioEngine.inputNode
recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
recognitionRequest?.shouldReportPartialResults = true
recognitionTask = MySppechRecognizer.speechSharedInstance.speechRecognizer?.recognitionTask(with: recognitionRequest!, resultHandler: { (result, error) in
print("Result is===\(String(describing: result?.bestTranscription.formattedString))")
var isFinal = false
if result != nil {
isFinal = (result?.isFinal)!
self.isFinalWord = (result?.isFinal)!
}
if error != nil || isFinal {
self.audioEngine.stop()
self.inputNode?.removeTap(onBus: 0)
self.recognitionRequest = nil
self.recognitionTask = nil
}
if error != nil {
print(“Error === \(String(describing: error?.localizedDescription))")
self.isFinalWord = true
}
guard self.callBack == nil else {
self.callBack!(result,error)
return
}
})
let recordingFormat = inputNode?.outputFormat(forBus: 0)
inputNode?.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) {[weak self] (buffer, when) in
self?.recognitionRequest?.append(buffer)
}
audioEngine.prepare()
do {
try audioEngine.start()
} catch {
print("audioEngine couldn't start because of an error.")
}
}
/// to stop the audio session
func stopRecording() {
DispatchQueue.main.async {
if ((self.audioEngine.isRunning)){
self.recognitionRequest?.endAudio()
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest = nil
self.inputNode?.reset()
self.inputNode?.removeTap(onBus: 0)
self.inputNode?.reset()
self.audioEngine.inputNode.reset()
}
}
}
}

Video Not Playing After SpeechRecognizer

Video not playing after SpeechRecognizer. Not getting any error just stuck on AVPlayerViewController. I have stopped speechRecognizer also. then after I am trying to play video. The video perfectly plays before speechRecognizer.
Maybe that possible speechRecognizer is not stopping by this code. So, Maybe the problem is in stopRecording().
#IBAction func btnRecord(_ sender: Any) {
player.pause()
player.seek(to: CMTime.init(value: 0, timescale: player.currentTime().timescale))
if self.audioEngine.isRunning {
self.audioEngine.stop()
self.recognitionRequest?.endAudio()
}
else {
try! self.startRecording()
}
}
private func startRecording() throws {
// Cancel the previous task if it's running.
if let recognitionTask = recognitionTask {
recognitionTask.cancel()
self.recognitionTask = nil
}
let audioSession = AVAudioSession.sharedInstance()
try audioSession.setCategory(AVAudioSession.Category.record, mode: .default, options: [])
try audioSession.setMode(AVAudioSession.Mode.measurement)
try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
let inputNode = audioEngine.inputNode
//else { fatalError("Audio engine has no input node") }
guard let recognitionRequest = recognitionRequest else { fatalError("Unable to created a SFSpeechAudioBufferRecognitionRequest object") }
// Configure request so that results are returned before audio recording is finished
recognitionRequest.shouldReportPartialResults = true
// A recognition task represents a speech recognition session.
// We keep a reference to the task so that it can be cancelled.
recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
var isFinal = false
if let result = result {
self.text = result.bestTranscription.formattedString
self.lblText.text = self.text
isFinal = result.isFinal
}
if error != nil || isFinal {
self.audioEngine.stop()
inputNode.removeTap(onBus: 0)
self.recognitionRequest = nil
self.recognitionTask = nil
}
}
let recordingFormat = inputNode.outputFormat(forBus: 0)
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
self.recognitionRequest?.append(buffer)
}
audioEngine.prepare()
try audioEngine.start()
}
private func stopRecording() {
audioEngine.stop()
recognitionRequest?.endAudio()
if let recognitionTask = recognitionTask {
recognitionTask.cancel()
self.recognitionTask = nil
}
}
#IBAction func btnDonePopup(_ sender: Any) {
self.stopRecording()
self.playVideo()
}
Please change audioSession.setCategory to default value:
if error != nil || isFinal {
self.audioEngine.stop()
inputNode.removeTap(onBus: 0)
self.recognitionRequest = nil
self.recognitionTask = nil
do {
try audioSession.setCategory(.soloAmbient, mode: .measurement, options: [])
} catch { }
}

Swift - Stop speech recognition on no talk [iOS 10]

I am working on an app that uses the new Speech framework in ios 10 to do some speech-to-text stuff. What is the best way of stopping the recognition when the user stops talking?
private func startRecording() {
isRecording = true
if let recognitionTask = recognitionTask {
recognitionTask.cancel()
self.recognitionTask = nil
}
let audioSession = AVAudioSession.sharedInstance()
do {
try audioSession.setCategory(AVAudioSessionCategoryRecord, mode: AVAudioSessionModeMeasurement)
try audioSession.setActive(true, with: .notifyOthersOnDeactivation)
} catch {
print("audioSession properties weren't set because of an error.")
return
}
recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
guard let inputNode = audioEngine.inputNode else {
fatalError("Audio engine has no input node")
}
guard let recognitionRequest = recognitionRequest else {
fatalError("Unable to create an SFSpeechAudioBufferRecognitionRequest object")
}
recognitionRequest.shouldReportPartialResults = true
recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest, resultHandler: { (result, error) in
if let result = result {
if error != nil || result.isFinal {
self.audioEngine.stop()
inputNode.removeTap(onBus: 0)
self.recognitionRequest = nil
self.recognitionTask = nil
let questionText = result.bestTranscription.formattedString
isRecording = false
self.audioEngine.stop()
recognitionRequest.endAudio()
self.audioEngine.inputNode?.removeTap(onBus: 0)
}
}
})
let recordingFormat = inputNode.outputFormat(forBus: 0)
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
self.recognitionRequest?.append(buffer)
}
audioEngine.prepare()
try! audioEngine.start()
}
I want this code to be called once user does not talk
private func stopRecording() {
isRecording = false
audioEngine.stop()
recognitionRequest?.endAudio()
audioEngine.inputNode?.removeTap(onBus: 0)
}

Voice to String in Swift

the app I'm currently making in Swift will help blind people navigate the world using this one comprehensive solution. I am looking to make a generic function for the app that when called, will immediately start recording, listen for the user to say something, and once the user stops speaking, it will automatically stop recording, convert the recording to a string, and return it. This function should be usable more than once in a single view controller.
I have tried using the technique from this article and it didn't work: https://medium.com/ios-os-x-development/speech-recognition-with-swift-in-ios-10-50d5f4e59c48
The recorder will be collecting the name of a building or a room in a building, so it doesn't need to be recording for terribly long - even a set length of time of 5 seconds would work. I am hoping to use a framework like Speech or something with Siri, but I am not opposed to using an external framework like Watson if it works better. Please help!
There's a beautiful appcoda tutorial here, which fits this perfectly.
This is the code they used to update a text field with the speech results. It can't be too difficult to channel the text going in their text field into whatever variable/function you use to process the result.
//
// ViewController.swift
// Siri
//
// Created by Sahand Edrisian on 7/14/16.
// Copyright © 2016 Sahand Edrisian. All rights reserved.
//
import UIKit
import Speech
class ViewController: UIViewController, SFSpeechRecognizerDelegate {
#IBOutlet weak var textView: UITextView!
#IBOutlet weak var microphoneButton: UIButton!
private let speechRecognizer = SFSpeechRecognizer(locale: Locale.init(identifier: "en-US"))!
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private let audioEngine = AVAudioEngine()
override func viewDidLoad() {
super.viewDidLoad()
microphoneButton.isEnabled = false
speechRecognizer.delegate = self
SFSpeechRecognizer.requestAuthorization { (authStatus) in
var isButtonEnabled = false
switch authStatus {
case .authorized:
isButtonEnabled = true
case .denied:
isButtonEnabled = false
print("User denied access to speech recognition")
case .restricted:
isButtonEnabled = false
print("Speech recognition restricted on this device")
case .notDetermined:
isButtonEnabled = false
print("Speech recognition not yet authorized")
}
OperationQueue.main.addOperation() {
self.microphoneButton.isEnabled = isButtonEnabled
}
}
}
#IBAction func microphoneTapped(_ sender: AnyObject) {
if audioEngine.isRunning {
audioEngine.stop()
recognitionRequest?.endAudio()
microphoneButton.isEnabled = false
microphoneButton.setTitle("Start Recording", for: .normal)
} else {
startRecording()
microphoneButton.setTitle("Stop Recording", for: .normal)
}
}
func startRecording() {
if recognitionTask != nil { //1
recognitionTask?.cancel()
recognitionTask = nil
}
let audioSession = AVAudioSession.sharedInstance() //2
do {
try audioSession.setCategory(AVAudioSessionCategoryRecord)
try audioSession.setMode(AVAudioSessionModeMeasurement)
try audioSession.setActive(true, with: .notifyOthersOnDeactivation)
} catch {
print("audioSession properties weren't set because of an error.")
}
recognitionRequest = SFSpeechAudioBufferRecognitionRequest() //3
guard let inputNode = audioEngine.inputNode else {
fatalError("Audio engine has no input node")
} //4
guard let recognitionRequest = recognitionRequest else {
fatalError("Unable to create an SFSpeechAudioBufferRecognitionRequest object")
} //5
recognitionRequest.shouldReportPartialResults = true //6
recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest, resultHandler: { (result, error) in //7
var isFinal = false //8
if result != nil {
self.textView.text = result?.bestTranscription.formattedString //9
isFinal = (result?.isFinal)!
}
if error != nil || isFinal { //10
self.audioEngine.stop()
inputNode.removeTap(onBus: 0)
self.recognitionRequest = nil
self.recognitionTask = nil
self.microphoneButton.isEnabled = true
}
})
let recordingFormat = inputNode.outputFormat(forBus: 0) //11
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer, when) in
self.recognitionRequest?.append(buffer)
}
audioEngine.prepare() //12
do {
try audioEngine.start()
} catch {
print("audioEngine couldn't start because of an error.")
}
textView.text = "Say something, I'm listening!"
}
func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
if available {
microphoneButton.isEnabled = true
} else {
microphoneButton.isEnabled = false
}
}
}

iOS: AVSpeechSynthesizer doesn't work after recording with SFSpeechRecognizer

I am making an application that does Text-to-speech and speech-to-text.
The problem i am having right now is that text-to-speech works fine using AVSpeechSynthesizer. But after i record and do speech-to-text using SFSpeechRecognizer, the text-to-speech stops working (ie, doesn't talk back).
I am new to swift too. But i got this code from a couple of different tutorials and tried to merge them together.
Here's my code:
private var speechRecognizer = SFSpeechRecognizer(locale: Locale.init(identifier: "en-US"))!
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private var audioEngine = AVAudioEngine()
#objc(speak:location:date:callback:)
func speak(name: String, location: String, date: NSNumber,_ callback: #escaping (NSObject) -> ()) -> Void {
let utterance = AVSpeechUtterance(string: name)
let synthesizer = AVSpeechSynthesizer()
synthesizer.speak(utterance)
}
#available(iOS 10.0, *)
#objc(startListening:location:date:callback:)
func startListening(name: String, location: String, date: NSNumber,_ callback: #escaping (NSObject) -> ()) -> Void {
if audioEngine.isRunning {
audioEngine.stop()
recognitionRequest?.endAudio()
} else {
if recognitionTask != nil { //1
recognitionTask?.cancel()
recognitionTask = nil
}
let audioSession = AVAudioSession.sharedInstance() //2
do {
try audioSession.setCategory(AVAudioSessionCategoryPlayAndRecord)
try audioSession.setMode(AVAudioSessionModeMeasurement)
try audioSession.setActive(true, with: .notifyOthersOnDeactivation)
} catch {
print("audioSession properties weren't set because of an error.")
}
recognitionRequest = SFSpeechAudioBufferRecognitionRequest() //3
guard let inputNode = audioEngine.inputNode else {
fatalError("Audio engine has no input node")
} //4
guard let recognitionRequest = recognitionRequest else {
fatalError("Unable to create an SFSpeechAudioBufferRecognitionRequest object")
} //5
recognitionRequest.shouldReportPartialResults = true //6
recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest, resultHandler: { (result, error) in //7
var isFinal = false //8
if result != nil {
print(result?.bestTranscription.formattedString) //9
isFinal = (result?.isFinal)!
}
if error != nil || isFinal { //10
self.audioEngine.stop()
inputNode.removeTap(onBus: 0)
self.recognitionRequest = nil
self.recognitionTask = nil
}
})
let recordingFormat = inputNode.outputFormat(forBus: 0) //11
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer, when) in
self.recognitionRequest?.append(buffer)
}
audioEngine.prepare() //12
do {
try audioEngine.start()
} catch {
print("audioEngine couldn't start because of an error.")
}
}
}
They both have an AVAudioSession.
For AVSpeechSynthesizer I suppose it has to be set to:
_audioSession.SetCategory(AVAudioSessionCategory.Playback,
AVAudioSessionCategoryOptions.MixWithOthers);
and For SFSpeechRecognizer:
_audioSession.SetCategory(AVAudioSessionCategory.PlayAndRecord,
AVAudioSessionCategoryOptions.MixWithOthers);
Hope it helps.

Resources