How can I do face detection in realtime just as "Camera" does?
I noticed that AVCaptureStillImageOutput is deprecated as of iOS 10.0, so I use AVCapturePhotoOutput instead. However, I found that the image I save for facial detection is not satisfactory. Any ideas?
UPDATE
After trying what @Shravya Boggarapu mentioned, I now use AVCaptureMetadataOutput to detect faces without CIFaceDetector. It works as expected. However, when I try to draw the bounds of the face, they seem mislocated. Any idea?
let metaDataOutput = AVCaptureMetadataOutput()
captureSession.sessionPreset = AVCaptureSessionPresetPhoto
let backCamera = AVCaptureDevice.defaultDevice(withDeviceType: .builtInWideAngleCamera, mediaType: AVMediaTypeVideo, position: .back)
do {
let input = try AVCaptureDeviceInput(device: backCamera)
if (captureSession.canAddInput(input)) {
captureSession.addInput(input)
// MetadataOutput instead
if(captureSession.canAddOutput(metaDataOutput)) {
captureSession.addOutput(metaDataOutput)
metaDataOutput.setMetadataObjectsDelegate(self, queue: DispatchQueue.main)
metaDataOutput.metadataObjectTypes = [AVMetadataObjectTypeFace]
previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
previewLayer?.frame = cameraView.bounds
previewLayer?.videoGravity = AVLayerVideoGravityResizeAspectFill
cameraView.layer.addSublayer(previewLayer!)
captureSession.startRunning()
}
}
} catch {
print(error.localizedDescription)
}
and
extension CameraViewController: AVCaptureMetadataOutputObjectsDelegate {
func captureOutput(_ captureOutput: AVCaptureOutput!, didOutputMetadataObjects metadataObjects: [Any]!, from connection: AVCaptureConnection!) {
if findFaceControl {
findFaceControl = false
for metadataObject in metadataObjects {
if (metadataObject as AnyObject).type == AVMetadataObjectTypeFace {
print("😇😍😎")
print(metadataObject)
let bounds = (metadataObject as! AVMetadataFaceObject).bounds
print("origin x: \(bounds.origin.x)")
print("origin y: \(bounds.origin.y)")
print("size width: \(bounds.size.width)")
print("size height: \(bounds.size.height)")
print("cameraView width: \(self.cameraView.frame.width)")
print("cameraView height: \(self.cameraView.frame.height)")
var face = CGRect()
face.origin.x = bounds.origin.x * self.cameraView.frame.width
face.origin.y = bounds.origin.y * self.cameraView.frame.height
face.size.width = bounds.size.width * self.cameraView.frame.width
face.size.height = bounds.size.height * self.cameraView.frame.height
print(face)
showBounds(at: face)
}
}
}
}
}
Original (see it on GitHub)
var captureSession = AVCaptureSession()
var photoOutput = AVCapturePhotoOutput()
var previewLayer: AVCaptureVideoPreviewLayer?
override func viewWillAppear(_ animated: Bool) {
super.viewWillAppear(true)
captureSession.sessionPreset = AVCaptureSessionPresetHigh
let backCamera = AVCaptureDevice.defaultDevice(withMediaType: AVMediaTypeVideo)
do {
let input = try AVCaptureDeviceInput(device: backCamera)
if (captureSession.canAddInput(input)) {
captureSession.addInput(input)
if(captureSession.canAddOutput(photoOutput)){
captureSession.addOutput(photoOutput)
captureSession.startRunning()
previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
previewLayer?.videoGravity = AVLayerVideoGravityResizeAspectFill
previewLayer?.frame = cameraView.bounds
cameraView.layer.addSublayer(previewLayer!)
}
}
} catch {
print(error.localizedDescription)
}
}
func captureImage() {
let settings = AVCapturePhotoSettings()
let previewPixelType = settings.availablePreviewPhotoPixelFormatTypes.first!
let previewFormat = [kCVPixelBufferPixelFormatTypeKey as String: previewPixelType]
settings.previewPhotoFormat = previewFormat
photoOutput.capturePhoto(with: settings, delegate: self)
}
func capture(_ captureOutput: AVCapturePhotoOutput, didFinishProcessingPhotoSampleBuffer photoSampleBuffer: CMSampleBuffer?, previewPhotoSampleBuffer: CMSampleBuffer?, resolvedSettings: AVCaptureResolvedPhotoSettings, bracketSettings: AVCaptureBracketedStillImageSettings?, error: Error?) {
if let error = error {
print(error.localizedDescription)
}
// Does not include previewPhotoSampleBuffer
if let sampleBuffer = photoSampleBuffer,
let dataImage = AVCapturePhotoOutput.jpegPhotoDataRepresentation(forJPEGSampleBuffer: sampleBuffer, previewPhotoSampleBuffer: nil) {
self.imageView.image = UIImage(data: dataImage)
self.imageView.isHidden = false
self.previewLayer?.isHidden = true
self.findFace(img: self.imageView.image!)
}
}
findFace works with a normal image. However, with the image I capture via the camera it either does not work at all or sometimes recognizes only one face.
Normal Image
Capture Image
func findFace(img: UIImage) {
guard let faceImage = CIImage(image: img) else { return }
let accuracy = [CIDetectorAccuracy: CIDetectorAccuracyHigh]
let faceDetector = CIDetector(ofType: CIDetectorTypeFace, context: nil, options: accuracy)
// For converting the Core Image Coordinates to UIView Coordinates
let detectedImageSize = faceImage.extent.size
var transform = CGAffineTransform(scaleX: 1, y: -1)
transform = transform.translatedBy(x: 0, y: -detectedImageSize.height)
if let faces = faceDetector?.features(in: faceImage, options: [CIDetectorSmile: true, CIDetectorEyeBlink: true]) {
for face in faces as! [CIFaceFeature] {
// Apply the transform to convert the coordinates
var faceViewBounds = face.bounds.applying(transform)
// Calculate the actual position and size of the rectangle in the image view
let viewSize = imageView.bounds.size
let scale = min(viewSize.width / detectedImageSize.width,
viewSize.height / detectedImageSize.height)
let offsetX = (viewSize.width - detectedImageSize.width * scale) / 2
let offsetY = (viewSize.height - detectedImageSize.height * scale) / 2
faceViewBounds = faceViewBounds.applying(CGAffineTransform(scaleX: scale, y: scale))
print("faceBounds = \(faceViewBounds)")
faceViewBounds.origin.x += offsetX
faceViewBounds.origin.y += offsetY
showBounds(at: faceViewBounds)
}
if faces.count != 0 {
print("Number of faces: \(faces.count)")
} else {
print("No faces 😢")
}
}
}
func showBounds(at bounds: CGRect) {
let indicator = UIView(frame: bounds)
indicator.frame = bounds
indicator.layer.borderWidth = 3
indicator.layer.borderColor = UIColor.red.cgColor
indicator.backgroundColor = .clear
self.imageView.addSubview(indicator)
faceBoxes.append(indicator)
}
There are two ways to detect faces: CIFaceDetector and AVCaptureMetadataOutput. Depending on your requirements, choose what is relevant for you.
CIFaceDetector has more features: it gives you the location of the eyes and mouth, a smile detector, etc.
On the other hand, AVCaptureMetadataOutput is computed on the frames, the detected faces are tracked, and there is no extra code for us to add. I find that, because of the tracking, faces are detected more reliably with this approach. The downside is that you will simply detect faces, not the position of the eyes or mouth.
Another advantage of this method is that orientation issues are smaller: you can set videoOrientation whenever the device orientation changes, and the orientation of the faces will be relative to that orientation (a small sketch of this follows the snippet below).
In my case, my application uses YUV420 as the required format, so using CIDetector (which works with RGB) in real time was not viable. Using AVCaptureMetadataOutput saved a lot of effort and performed more reliably due to continuous tracking.
Once I had the bounding box for the faces, I coded extra features, such as skin detection, and applied them to the still image.
Note: When you capture a still image, the face box information is added along with the metadata so there are no sync issues.
You can also use a combination of the two to get better results.
Explore and evaluate the pros and cons as per your application.
The face rectangle is relative to the image origin, so for the screen it may be different.
Use:
for (AVMetadataFaceObject *faceFeatures in metadataObjects) {
CGRect face = faceFeatures.bounds;
CGRect facePreviewBounds = CGRectMake(face.origin.y * previewLayerRect.size.width,
face.origin.x * previewLayerRect.size.height,
face.size.width * previewLayerRect.size.height,
face.size.height * previewLayerRect.size.width);
/* Draw rectangle facePreviewBounds on screen */
}
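A minimal sketch of the orientation idea, not from the original answer: call this with the relevant connection (for example previewLayer?.connection or the output's video connection) whenever the device rotates.
func updateVideoOrientation(for connection: AVCaptureConnection) {
    guard connection.isVideoOrientationSupported else { return }
    switch UIDevice.current.orientation {
    case .landscapeLeft: connection.videoOrientation = .landscapeRight
    case .landscapeRight: connection.videoOrientation = .landscapeLeft
    case .portraitUpsideDown: connection.videoOrientation = .portraitUpsideDown
    default: connection.videoOrientation = .portrait
    }
}
Calling it from a rotation callback such as viewWillTransition(to:with:) keeps the reported face bounds consistent with what is on screen.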
To perform face detection on iOS, you can use either CIDetector (Apple) or the Mobile Vision (Google) API.
IMO, Google Mobile Vision provides better performance.
If you are interested, here is the project you can play with. (iOS 10.2, Swift 3)
At WWDC 2017, Apple introduced Core ML in iOS 11.
The Vision framework makes face detection more accurate :)
I've made a demo project comparing Vision vs. CIDetector. It also contains face landmark detection in real time.
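For reference, a minimal Vision sketch (iOS 11+, my own illustration rather than the demo project's code); the pixel buffer would come from your capture delegate, and the orientation value is an assumption that depends on your camera setup:
import Vision

func detectFaces(in pixelBuffer: CVPixelBuffer) {
    let request = VNDetectFaceRectanglesRequest { request, _ in
        guard let faces = request.results as? [VNFaceObservation] else { return }
        for face in faces {
            // boundingBox is normalized (0...1) with the origin at the bottom-left.
            print("face bounding box: \(face.boundingBox)")
        }
    }
    // .right is a common orientation for the back camera in portrait; adjust for your setup.
    let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:])
    try? handler.perform([request])
}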
A bit late, but here it is the solution for the coordinates problem. There is a method you can call on the preview layer to transform the metadata object to your coordinate system: transformedMetadataObject(for: metadataObject).
guard let transformedObject = previewLayer.transformedMetadataObject(for: metadataObject) else {
continue
}
let bounds = transformedObject.bounds
showBounds(at: bounds)
Source: https://developer.apple.com/documentation/avfoundation/avcapturevideopreviewlayer/1623501-transformedmetadataobjectformeta
By the way, in case you are using (or upgrading your project to) Swift 4, the delegate method of AVCaptureMetadataOutputObjectsDelegate has changed to:
func metadataOutput(_ output: AVCaptureMetadataOutput, didOutput metadataObjects: [AVMetadataObject], from connection: AVCaptureConnection)
Kind regards
extension CameraViewController: AVCaptureMetadataOutputObjectsDelegate {
func captureOutput(_ captureOutput: AVCaptureOutput!, didOutputMetadataObjects metadataObjects: [Any]!, from connection: AVCaptureConnection!) {
if findFaceControl {
findFaceControl = false
let faces = metadataObjects.flatMap { $0 as? AVMetadataFaceObject }.flatMap { (face) -> CGRect? in
guard let localizedFace = previewLayer?.transformedMetadataObject(for: face) else { return nil }
return localizedFace.bounds }
for face in faces {
let temp = UIView(frame: face)
temp.layer.borderColor = UIColor.white.cgColor
temp.layer.borderWidth = 2.0
view.addSubview(temp)
}
}
}
}
Be sure to remove the views created in didOutputMetadataObjects.
Keeping track of the active face IDs is the best way to do this (a sketch follows).
Also, when you're trying to find the location of faces for your preview layer, it is much easier to use the face metadata and transform it. And I think CIDetector is junk; the metadata output uses hardware for face detection, which makes it really fast.
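A rough sketch of that bookkeeping, not from the original answer; faceViews is an assumed [Int: UIView] property, and previewLayer and view come from the question's controller:
var faceViews = [Int: UIView]() // assumed property: one overlay per AVMetadataFaceObject.faceID

func updateFaceOverlays(with faces: [AVMetadataFaceObject]) {
    let activeIDs = Set(faces.map { $0.faceID })
    // Drop overlays whose face is no longer reported.
    for (id, overlay) in faceViews where !activeIDs.contains(id) {
        overlay.removeFromSuperview()
        faceViews[id] = nil
    }
    // Add or move overlays for the faces that are still tracked.
    for face in faces {
        guard let transformed = previewLayer?.transformedMetadataObject(for: face) else { continue }
        let overlay: UIView
        if let existing = faceViews[face.faceID] {
            overlay = existing
        } else {
            let v = UIView()
            v.layer.borderColor = UIColor.white.cgColor
            v.layer.borderWidth = 2
            view.addSubview(v)
            faceViews[face.faceID] = v
            overlay = v
        }
        overlay.frame = transformed.bounds
    }
}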
1. Create a capture session (a combined setup sketch is shown after the helper function below).
2. For AVCaptureVideoDataOutput, create the following settings:
output.videoSettings = [ kCVPixelBufferPixelFormatTypeKey as AnyHashable: Int(kCMPixelFormat_32BGRA) ]
3. When you receive a CMSampleBuffer, create an image:
DispatchQueue.main.async {
let sampleImg = self.imageFromSampleBuffer(sampleBuffer: sampleBuffer)
self.imageView.image = sampleImg
}
func imageFromSampleBuffer(sampleBuffer : CMSampleBuffer) -> UIImage
{
// Get a CMSampleBuffer's Core Video image buffer for the media data
let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
// Lock the base address of the pixel buffer
CVPixelBufferLockBaseAddress(imageBuffer!, CVPixelBufferLockFlags.readOnly);
// Get the number of bytes per row for the pixel buffer
let baseAddress = CVPixelBufferGetBaseAddress(imageBuffer!);
// Get the number of bytes per row for the pixel buffer
let bytesPerRow = CVPixelBufferGetBytesPerRow(imageBuffer!);
// Get the pixel buffer width and height
let width = CVPixelBufferGetWidth(imageBuffer!);
let height = CVPixelBufferGetHeight(imageBuffer!);
// Create a device-dependent RGB color space
let colorSpace = CGColorSpaceCreateDeviceRGB();
// Create a bitmap graphics context with the sample buffer data
var bitmapInfo: UInt32 = CGBitmapInfo.byteOrder32Little.rawValue
bitmapInfo |= CGImageAlphaInfo.premultipliedFirst.rawValue & CGBitmapInfo.alphaInfoMask.rawValue
//let bitmapInfo: UInt32 = CGBitmapInfo.alphaInfoMask.rawValue
let context = CGContext.init(data: baseAddress, width: width, height: height, bitsPerComponent: 8, bytesPerRow: bytesPerRow, space: colorSpace, bitmapInfo: bitmapInfo)
// Create a Quartz image from the pixel data in the bitmap graphics context
let quartzImage = context?.makeImage();
// Unlock the pixel buffer
CVPixelBufferUnlockBaseAddress(imageBuffer!, CVPixelBufferLockFlags.readOnly);
// Create an image object from the Quartz image
let image = UIImage.init(cgImage: quartzImage!);
return (image);
}
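A minimal sketch tying steps 1 and 2 together (my own illustration, written inside your view controller; self is assumed to adopt AVCaptureVideoDataOutputSampleBufferDelegate):
let session = AVCaptureSession()
if let camera = AVCaptureDevice.default(for: .video),
   let input = try? AVCaptureDeviceInput(device: camera),
   session.canAddInput(input) {
    session.addInput(input)
}
let output = AVCaptureVideoDataOutput()
output.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: Int(kCMPixelFormat_32BGRA)]
output.setSampleBufferDelegate(self, queue: DispatchQueue(label: "video.output.queue"))
if session.canAddOutput(output) {
    session.addOutput(output)
}
session.startRunning()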
By looking at your code, I detected two things that could lead to wrong or poor face detection.
One of them is the face detector feature options, where you are filtering the results with [CIDetectorSmile: true, CIDetectorEyeBlink: true]. Try setting them to nil: faceDetector?.features(in: faceImage, options: nil)
Another guess I have is the resulting image orientation. I noticed you use the AVCapturePhotoOutput.jpegPhotoDataRepresentation method to generate the source image for the detection, and by default the system generates that image with a specific orientation, of type Left/LandscapeLeft, I think. So, basically, you can tell the face detector to take that into account by using the CIDetectorImageOrientation key.
CIDetectorImageOrientation: the value for this key is an integer NSNumber from 1..8 such as that found in kCGImagePropertyOrientation. If present, the detection will be done based on that orientation but the coordinates in the returned features will still be based on those of the image.
Try setting it like faceDetector?.features(in: faceImage, options: [CIDetectorImageOrientation: 8 /* Left, bottom */]).
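If you would rather not hard-code the value, here is a small sketch of deriving it from the UIImage orientation (my own addition, not from the original answer); the mapping follows kCGImagePropertyOrientation:
func exifOrientation(for image: UIImage) -> Int {
    switch image.imageOrientation {
    case .up: return 1
    case .down: return 3
    case .left: return 8
    case .right: return 6
    case .upMirrored: return 2
    case .downMirrored: return 4
    case .leftMirrored: return 5
    case .rightMirrored: return 7
    default: return 1
    }
}
// e.g. faceDetector?.features(in: faceImage, options: [CIDetectorImageOrientation: exifOrientation(for: img)])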
Related
I'm working on implementing a camera barcode scanner with Vision framework. My setup seems fairly simple: a preview layer for video from the device's rear camera, frames from the video stream are processed using VNDetectBarcodesRequest, and if a barcode is detected, I would like to show a rectangle around it in the preview.
Vision framework coordinates are normalized to the video frame size, so I have to convert them back to the layer coordinates to be able to draw the rectangle. The recommended way to do this is to use AVCaptureVideoPreviewLayer.layerPointConverted(fromCaptureDevicePoint:). There is also a difference between the point of origin (it's in the lower left corner for Vision framework), so I have to take care of that as well. Here's what the result looks like:
let convertPoint = { (point: CGPoint) in
let coordinateFlip = CGAffineTransform.identity
.translatedBy(x: 0, y: 1)
.scaledBy(x: 1, y: -1)
return videoLayer.layerPointConverted(fromCaptureDevicePoint: point.applying(coordinateFlip))
}
The problem is that this conversion is incorrect: the rectangle ends up being rotated 90º clockwise:
I have no idea why, I set video orientation to .portrait on both AVCaptureVideoPreviewLayer.connection and AVCaptureVideoDataOutput.connection(with: .video). The Vision coordinates seem to be correct, if I convert them manually like so:
extension VNBarcodeObservation {
func verticesConvertedToCoordinatesOf(_ targetLayer: CALayer) -> [CGPoint] {
let layerSize = targetLayer.bounds.size
return [topLeft, topRight, bottomRight, bottomLeft]
.map { CGPoint(x: $0.x * layerSize.width, y: $0.y * layerSize.height) }
}
}
the result is correct:
Why is AVCaptureVideoPreviewLayer.layerPointConverted(fromCaptureDevicePoint:) returning incorrect results?
It seems like somehow AVCaptureVideoPreviewLayer is stuck in the .landscapeRight orientation.
Here's how I set up video capture:
let captureSessionQueue = DispatchQueue.global(qos: .userInteractive)
let camera = AVCaptureDevice.default(for: .video)
let session = AVCaptureSession()
session.beginConfiguration()
session.setCamera(camera, preferredPresets: [.hd1280x720])
let request = VNDetectBarcodesRequest(symbologies: VNDetectBarcodesRequest.supportedSymbologies)
request.regionOfInterest = CGRect(x: 0.15, y: 0.45, width: 0.7, height: 0.4)
let delegate = CaptureVideoDataOutputDelegate()
delegate.vnRequests = [request]
let deviceOutput = AVCaptureVideoDataOutput()
deviceOutput.setSampleBufferDelegate(delegate, queue: captureSessionQueue)
deviceOutput.alwaysDiscardsLateVideoFrames = true
session.addOutput(deviceOutput)
deviceOutput.connection(with: .video)?.videoOrientation = .portrait
session.commitConfiguration()
captureSessionQueue.async {
session.startRunning()
}
and CaptureVideoDataOutputDelegate:
public class CaptureVideoDataOutputDelegate: NSObject, AVCaptureVideoDataOutputSampleBufferDelegate {
public var vnRequests: [VNRequest]?
private let vnHandler = VNSequenceRequestHandler()
private var frameCounter: Int64 = 0
public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
defer { frameCounter += 1 }
guard
// Try to reduce CPU usage and energy consumption by processing only every 10th frame
frameCounter.isMultiple(of: 10),
let vnRequests = vnRequests,
let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)
else { return }
do {
let size = CGSize(width: CVPixelBufferGetWidth(pixelBuffer), height: CVPixelBufferGetHeight(pixelBuffer))
print(">>> will scan a \(size) frame #\(frameCounter) for barcodes (connection: \(connection), orientation: \(connection.videoOrientation))")
try vnHandler.perform(vnRequests, on: pixelBuffer, orientation: .up)
} catch {
print(">>> failed to scan frame #\(frameCounter) for barcodes: \(error)")
}
}
}
I have verified that both deviceOutput.connection(with: .video) and videoLayer.connection are not nil when I try to configure video orientation. Examining the size of buffer received by CaptureVideoDataOutputDelegate confirms that orientation is correct (the size is 720x1280, not 1280x720). Orientation of the connection is also .portrait at this point.
I have a very strange case where AVCaptureVideoDataOutputSampleBufferDelegate drops frames if I use 13 different filter chains. Let me explain:
I have a CameraController set up, nothing special; here is my delegate method:
func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
if !paused {
if connection.output?.connection(with: .audio) == nil {
//capture video
// my try to avoid "Out of buffers error", no luck ;(
lastCapturedBuffer = nil
let err = CMSampleBufferCreateCopy(allocator: kCFAllocatorDefault, sampleBuffer: sampleBuffer, sampleBufferOut: &lastCapturedBuffer)
if err == noErr {
}
connection.videoOrientation = .portrait
// getting image
let pixelBuffer = CMSampleBufferGetImageBuffer(lastCapturedBuffer!)
// remove if any
CVPixelBufferLockBaseAddress(pixelBuffer!, CVPixelBufferLockFlags(rawValue: 0))
// captured - is just ciimage property
captured = CIImage(cvPixelBuffer: pixelBuffer!)
//remove if any
CVPixelBufferUnlockBaseAddress(pixelBuffer!,CVPixelBufferLockFlags(rawValue: 0))
//CVPixelBufferUnlockBaseAddress(pixelBuffer!, .readOnly)
// transform image to targer resolution
let srcWidth = CGFloat(captured.extent.width)
let srcHeight = CGFloat(captured.extent.height)
let dstWidth: CGFloat = ConstantsManager.shared.k_video_width
let dstHeight: CGFloat = ConstantsManager.shared.k_video_height
let scaleX = dstWidth / srcWidth
let scaleY = dstHeight / srcHeight
var transform = CGAffineTransform.init(scaleX: scaleX, y: scaleY)
captured = captured.transformed(by: transform).cropped(to: CGRect(x: 0, y: 0, width: dstWidth, height: dstHeight))
// mirror for front camera
if front {
var t = CGAffineTransform.init(scaleX: -1, y: 1)
t = t.translatedBy(x: -ConstantsManager.shared.k_video_width, y: 0)
captured = captured.transformed(by: t)
}
// video capture logic
let writable = canWrite()
if writable,
sessionAtSourceTime == nil {
sessionAtSourceTime = CMSampleBufferGetPresentationTimeStamp(lastCapturedBuffer!)
videoWriter.startSession(atSourceTime: sessionAtSourceTime!)
}
if writable, (videoWriterInput.isReadyForMoreMediaData) {
videoWriterInput.append(lastCapturedBuffer!)
}
// apply effect in realtime <- here is problem. If I comment next line, it will be fixed but effect will n't be applied
captured = FilterManager.shared.applyFilterForCamera(inputImage: captured)
// current frame in case user wants to save image as photo
self.capturedPhoto = captured
// sent frame to Camcoder view controller
self.delegate?.didCapturedFrame(frame: captured)
} else {
// capture sound
let writable = canWrite()
if writable, (audioWriterInput.isReadyForMoreMediaData) {
//print("write audio buffer")
audioWriterInput?.append(lastCapturedBuffer!)
}
}
} else {
// paused
}
}
I also implemented didDrop delegate method, here is how I figure out why it drops frames:
func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
print("did drop")
var mode: CMAttachmentMode = 0
let reason = CMGetAttachment(sampleBuffer, key: kCMSampleBufferAttachmentKey_DroppedFrameReason, attachmentModeOut: &mode)
print("reason \(String(describing: reason))") // Optional(OutOfBuffers)
}
So I did it like a pro and just commented out parts of the code to find where the problem is. It is here:
captured = FilterManager.shared.applyFilterForCamera(inputImage: captured)
FilterManager is a singleton; here is the function it calls:
func applyFilterForCamera(inputImage: CIImage) -> CIImage {
return currentVsFilter!.apply(sourceImage: inputImage)
}
currentVsFilter is an object of the VSFilter type; here is an example of one:
import Foundation
import AVKit
class TestFilter: CustomFilter {
let _name = "Тестовый Фильтр"
let _displayName = "Test Filter"
var tempImage: CIImage?
var final: CGImage?
override func name() -> String {
return _name
}
override func displayName() -> String {
return _displayName
}
override init() {
super.init()
print("Test Filter init")
// setup my custom kernel filter
self.noise.type = GlitchFilter.GlitchType.allCases[2]
}
// this returns composition for playback using AVPlayer
override func composition(asset: AVAsset) -> AVMutableVideoComposition {
let composition = AVMutableVideoComposition(asset: asset, applyingCIFiltersWithHandler: { request in
let inputImage = request.sourceImage.cropped(to: request.sourceImage.extent)
DispatchQueue.global(qos: .userInitiated).async {
let output = self.apply(sourceImage: inputImage, forComposition: true)
request.finish(with: output, context: nil)
}
})
let size = FilterManager.shared.cropRectForOrientation().size
composition.renderSize = size
return composition
}
// this returns actual filtered CIImage, used for both AVPlayer composition and realtime camera
override func apply(sourceImage: CIImage, forComposition: Bool = false) -> CIImage {
// rendered text
tempImage = FilterManager.shared.textRenderedImage()
// some filters chained one by one
self.screenBlend?.setValue(tempImage, forKey: kCIInputImageKey)
self.screenBlend?.setValue(sourceImage, forKey: kCIInputBackgroundImageKey)
self.noise.inputImage = self.screenBlend?.outputImage
self.noise.inputAmount = CGFloat.random(in: 1.0...3.0)
// result
tempImage = self.noise.outputImage
// correct crop
let rect = forComposition ? FilterManager.shared.cropRectForOrientation() : FilterManager.shared.cropRect
final = self.context.createCGImage(tempImage!, from: rect!)
return CIImage(cgImage: final!)
}
}
And now, the strangest thing: I have 30 VSFilters, and when I got to the 13th (switching one by one via a UIButton) I got the "Out of Buffers" error, this one:
kCMSampleBufferDroppedFrameReason_OutOfBuffers
What I tested:
I changed the vsFilters order in the filters array inside the FilterManager singleton: same result.
I tried switching from the first filter to the 12th one by one, then going back: that works, but after I switched to the 13th (of the 30, counting from 0), the bug appears.
It looks like it can handle only 12 VSFilter objects, as if it retains them somehow, or maybe it's related to threading; I don't know.
This app is made for iOS devices, tested on an iPhone X with iOS 13.3.1.
It is a video editor app that applies different effects to both the live stream from the camera and video files from the camera roll.
Maybe someone has experience with this?
Have a great day
Best, Victor
Edit 1. If I reinit the cameraController (AVCaptureSession, input/output devices) it works, but this is an ugly option and it adds lag when switching filters.
OK, so I finally won this battle. In case someone else gets this "OutOfBuffers" problem, here is my solution.
As I figured out, CIFilter grabs the CVPixelBuffer and doesn't release it while filtering images. It kind of creates one huge buffer, I guess. The strange thing is that it doesn't create a memory leak, so I guess it doesn't grab a particular buffer but keeps a strong reference to it. As the rumors (i.e., me) say, it can handle only 12 such references.
So my approach was to copy the CVPixelBuffer and then work with that instead of the buffer I got from the AVCaptureVideoDataOutputSampleBufferDelegate didOutput func.
Here is my new code:
func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
if !paused {
//print("camera controller \(id) got frame")
if connection.output?.connection(with: .audio) == nil {
//capture video
connection.videoOrientation = .portrait
// getting image
guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
// this works!
let copyBuffer = pixelBuffer.copy()
// captured - is just ciimage property
captured = CIImage(cvPixelBuffer: copyBuffer)
//remove if any
// transform image to targer resolution
let srcWidth = CGFloat(captured.extent.width)
let srcHeight = CGFloat(captured.extent.height)
let dstWidth: CGFloat = ConstantsManager.shared.k_video_width
let dstHeight: CGFloat = ConstantsManager.shared.k_video_height
let scaleX = dstWidth / srcWidth
let scaleY = dstHeight / srcHeight
var transform = CGAffineTransform.init(scaleX: scaleX, y: scaleY)
captured = captured.transformed(by: transform).cropped(to: CGRect(x: 0, y: 0, width: dstWidth, height: dstHeight))
// mirror for front camera
if front {
var t = CGAffineTransform.init(scaleX: -1, y: 1)
t = t.translatedBy(x: -ConstantsManager.shared.k_video_width, y: 0)
captured = captured.transformed(by: t)
}
// video capture logic
let writable = canWrite()
if writable,
sessionAtSourceTime == nil {
sessionAtSourceTime = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
videoWriter.startSession(atSourceTime: sessionAtSourceTime!)
}
if writable, (videoWriterInput.isReadyForMoreMediaData) {
videoWriterInput.append(sampleBuffer)
}
self.captured = FilterManager.shared.applyFilterForCamera(inputImage: self.captured)
// current frame in case user wants to save image as photo
self.capturedPhoto = captured
// sent frame to Camcoder view controller
self.delegate?.didCapturedFrame(frame: captured)
} else {
// capture sound
let writable = canWrite()
if writable, (audioWriterInput.isReadyForMoreMediaData) {
//print("write audio buffer")
audioWriterInput?.append(sampleBuffer)
}
}
} else {
// paused
//print("paused camera controller \(id)")
}
}
and here is the func to copy the buffer:
func copy() -> CVPixelBuffer {
precondition(CFGetTypeID(self) == CVPixelBufferGetTypeID(), "copy() cannot be called on a non-CVPixelBuffer")
var _copy : CVPixelBuffer?
CVPixelBufferCreate(
kCFAllocatorDefault,
CVPixelBufferGetWidth(self),
CVPixelBufferGetHeight(self),
CVPixelBufferGetPixelFormatType(self),
nil,
&_copy)
guard let copy = _copy else { fatalError() }
CVPixelBufferLockBaseAddress(self, CVPixelBufferLockFlags.readOnly)
CVPixelBufferLockBaseAddress(copy, CVPixelBufferLockFlags(rawValue: 0))
let copyBaseAddress = CVPixelBufferGetBaseAddress(copy)
let currBaseAddress = CVPixelBufferGetBaseAddress(self)
print("copy data size: \(CVPixelBufferGetDataSize(copy))")
print("self data size: \(CVPixelBufferGetDataSize(self))")
memcpy(copyBaseAddress, currBaseAddress, CVPixelBufferGetDataSize(copy))
//memcpy(copyBaseAddress, currBaseAddress, CVPixelBufferGetDataSize(self) * 2)
CVPixelBufferUnlockBaseAddress(copy, CVPixelBufferLockFlags(rawValue: 0))
CVPixelBufferUnlockBaseAddress(self, CVPixelBufferLockFlags.readOnly)
return copy
}
I used it as an extension on CVPixelBuffer.
I hope this will help anyone with a similar problem.
Best, Victor
I am having an issue where the Depth Data for the .builtInDualCamera appears to be rotated 90 degrees when isFilteringEnabled = true
Here is my code:
fileprivate let session = AVCaptureSession()
fileprivate let meta = AVCaptureMetadataOutput()
fileprivate let video = AVCaptureVideoDataOutput()
fileprivate let depth = AVCaptureDepthDataOutput()
fileprivate let camera: AVCaptureDevice
fileprivate let input: AVCaptureDeviceInput
fileprivate let synchronizer: AVCaptureDataOutputSynchronizer
init(delegate: CaptureSessionDelegate?) throws {
self.delegate = delegate
session.sessionPreset = .vga640x480
// Setup Camera Input
let discovery = AVCaptureDevice.DiscoverySession(deviceTypes: [.builtInDualCamera], mediaType: .video, position: .unspecified)
if let device = discovery.devices.first {
camera = device
} else {
throw SessionError.CameraNotAvailable("Unable to load camera")
}
input = try AVCaptureDeviceInput(device: camera)
session.addInput(input)
// Setup Metadata Output (Face)
session.addOutput(meta)
if meta.availableMetadataObjectTypes.contains(AVMetadataObject.ObjectType.face) {
meta.metadataObjectTypes = [ AVMetadataObject.ObjectType.face ]
} else {
print("Can't Setup Metadata: \(meta.availableMetadataObjectTypes)")
}
// Setup Video Output
video.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA]
session.addOutput(video)
video.connection(with: .video)?.videoOrientation = .portrait
// ****** THE ISSUE IS WITH THIS BLOCK HERE ******
// Setup Depth Output
depth.isFilteringEnabled = true
session.addOutput(depth)
depth.connection(with: .depthData)?.videoOrientation = .portrait
// Setup Synchronizer
synchronizer = AVCaptureDataOutputSynchronizer(dataOutputs: [depth, video, meta])
let outputRect = CGRect(x: 0, y: 0, width: 1, height: 1)
let videoRect = video.outputRectConverted(fromMetadataOutputRect: outputRect)
let depthRect = depth.outputRectConverted(fromMetadataOutputRect: outputRect)
// Ratio of the Depth to Video
scale = max(videoRect.width, videoRect.height) / max(depthRect.width, depthRect.height)
// Set Camera to the framerate of the Depth Data Collection
try camera.lockForConfiguration()
if let fps = camera.activeDepthDataFormat?.videoSupportedFrameRateRanges.first?.minFrameDuration {
camera.activeVideoMinFrameDuration = fps
}
camera.unlockForConfiguration()
super.init()
synchronizer.setDelegate(self, queue: syncQueue)
}
func dataOutputSynchronizer(_ synchronizer: AVCaptureDataOutputSynchronizer, didOutput data: AVCaptureSynchronizedDataCollection) {
guard let delegate = self.delegate else {
return
}
// Check to see if all the data is actually here
guard
let videoSync = data.synchronizedData(for: video) as? AVCaptureSynchronizedSampleBufferData,
!videoSync.sampleBufferWasDropped,
let depthSync = data.synchronizedData(for: depth) as? AVCaptureSynchronizedDepthData,
!depthSync.depthDataWasDropped
else {
return
}
// It's OK if the face isn't found.
let face: AVMetadataFaceObject?
if let metaSync = data.synchronizedData(for: meta) as? AVCaptureSynchronizedMetadataObjectData {
face = (metaSync.metadataObjects.first { $0 is AVMetadataFaceObject }) as? AVMetadataFaceObject
} else {
face = nil
}
// Convert Buffers to CIImage
let videoImage = convertVideoImage(fromBuffer: videoSync.sampleBuffer)
let depthImage = convertDepthImage(fromData: depthSync.depthData, andFace: face)
// Call Delegate
delegate.captureImages(video: videoImage, depth: depthImage, face: face)
}
fileprivate func convertVideoImage(fromBuffer sampleBuffer: CMSampleBuffer) -> CIImage {
// Convert the CMSampleBuffer to a CIImage - fairly straightforward
let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)
let image = CIImage(cvPixelBuffer: pixelBuffer!)
return image
}
fileprivate func convertDepthImage(fromData depthData: AVDepthData, andFace face: AVMetadataFaceObject?) -> CIImage {
var convertedDepth: AVDepthData
// Convert 16-bit floats up to 32-bit
if depthData.depthDataType != kCVPixelFormatType_DisparityFloat32 {
convertedDepth = depthData.converting(toDepthDataType: kCVPixelFormatType_DisparityFloat32)
} else {
convertedDepth = depthData
}
// Pixel buffer comes straight from depthData
let pixelBuffer = convertedDepth.depthDataMap
let image = CIImage(cvPixelBuffer: pixelBuffer)
return image
}
The original video looks like this (for reference):
When the values are:
// Setup Depth Output
depth.isFilteringEnabled = false
depth.connection(with: .depthData)?.videoOrientation = .portrait
The Image looks like this: (you can see the closer jacket is white, the farther jacket is grey, and the distance is dark grey - as expected)
When the values are:
// Setup Depth Output
depth.isFilteringEnabled = true
depth.connection(with: .depthData)?.videoOrientation = .portrait
The image looks like this: (You can see the color values appear to be in the right places, but the shapes in the smoothing filter appear to be rotated)
When the values are:
// Setup Depth Output
depth.isFilteringEnabled = true
depth.connection(with: .depthData)?.videoOrientation = .landscapeRight
The image looks like this: (Both the colors and the shapes appear to be horizontal)
Am I doing something wrong to get these incorrect values?
I have tried re-ordering the code
// Setup Depth Output
depth.connection(with: .depthData)?.videoOrientation = .portrait
depth.isFilteringEnabled = true
But that does nothing.
I think this is an issue related to iOS 12, because I remember this working just fine under iOS 11 (although I don't have any images saved to prove it)
Any help is appreciated, thanks!
Unlike the suggestion in other answers to rotate the image after creation, which I found did not work, the AVDepthData documentation provides a method that does the orientation correction for you.
The method is depthDataByApplyingExifOrientation:, which returns an instance of AVDepthData with the orientation applied, i.e. you can create your image in the orientation you desire by passing in the parameter of your choice.
This is my helper method that returns a UIImage with the orientation fix.
- (UIImage *)createDepthMapImageFromCapturePhoto:(AVCapturePhoto *)photo {
// AVCapturePhoto which has depthData - in swift you should confirm this exists
AVDepthData *frontDepthData = [photo depthData];
// Overwrite the instance with the correct orientation applied.
frontDepthData = [frontDepthData depthDataByApplyingExifOrientation:kCGImagePropertyOrientationRight];
// Create the CIImage from the depth data using the available method.
CIImage *ciDepthImage = [CIImage imageWithDepthData:frontDepthData];
// Create CIContext which enables converting CIImage to CGImage
CIContext *context = [[CIContext alloc] init];
// Create the CGImage
CGImageRef img = [context createCGImage:ciDepthImage fromRect:[ciDepthImage extent]];
// Create the final image.
UIImage *depthImage = [UIImage imageWithCGImage:img];
// Return the depth image.
return depthImage;
}
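A rough Swift equivalent of the helper above (my own sketch, assuming the AVCapturePhoto actually carries depth data):
import AVFoundation
import CoreImage
import UIKit

func depthMapImage(from photo: AVCapturePhoto) -> UIImage? {
    guard let depthData = photo.depthData else { return nil }
    // Apply the EXIF orientation so the depth map comes out in the desired orientation.
    let oriented = depthData.applyingExifOrientation(.right)
    guard let ciDepthImage = CIImage(depthData: oriented) else { return nil }
    let context = CIContext()
    guard let cgImage = context.createCGImage(ciDepthImage, from: ciDepthImage.extent) else { return nil }
    return UIImage(cgImage: cgImage)
}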
I played around with AVFoundation, trying to apply a filter to live video. I tried applying a filter to AVCaptureVideoDataOutput, but the output occupied only 1/4 of the view.
Here is some of my related code:
Capturing
let availableCameraDevices = AVCaptureDevice.devicesWithMediaType(AVMediaTypeVideo)
for device in availableCameraDevices as! [AVCaptureDevice] {
if device.position == .Back {
backCameraDevice = device
} else if device.position == .Front {
frontCameraDevice = device
}
}
Configure output
private func configureVideoOutput() {
videoOutput = AVCaptureVideoDataOutput()
videoOutput?.setSampleBufferDelegate(self, queue: dispatch_queue_create("sample buffer delegate", DISPATCH_QUEUE_SERIAL))
if session.canAddOutput(videoOutput) {
session.addOutput(videoOutput)
}
}
Get image
func captureOutput(captureOutput: AVCaptureOutput!, didOutputSampleBuffer sampleBuffer:
CMSampleBuffer!, fromConnection connection: AVCaptureConnection!) {
// Grab the pixelbuffer
let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)!
// create a CIImage from it, rotate it, and zero the origin
var image = CIImage(CVPixelBuffer: pixelBuffer)
image = image.imageByApplyingTransform(CGAffineTransformMakeRotation(CGFloat(-M_PI_2)))
let origin = image.extent.origin
image = image.imageByApplyingTransform(CGAffineTransformMakeTranslation(-origin.x, -origin.y))
self.manualDelegate?.cameraController(self, didOutputImage: image)
}
Render
func cameraController(cameraController: CameraController, didOutputImage image: CIImage) {
if glContext != EAGLContext.currentContext() {
EAGLContext.setCurrentContext(glContext)
}
let filteredImage = image.imageByApplyingFilter("CIColorControls", withInputParameters: [kCIInputSaturationKey: 0.0])
var rect = view.bounds
glView.bindDrawable()
ciContext.drawImage(filteredImage, inRect: rect, fromRect: image.extent)
glView.display()
}
I suspected the Retina display and scale factor were causing this, but I'm not sure where I should deal with it. I already set the content scale factor on the GLKView, but no luck.
private var glView: GLKView {
// Set in storyboard
return view as! GLKView
}
glView.contentScaleFactor = glView.bounds.size.width / UIScreen.mainScreen().bounds.size.width * UIScreen.mainScreen().scale
Your problem is with the output rect used in the drawImage function:
ciContext.drawImage(filteredImage, inRect: rect, fromRect: image.extent)
The image's extent is in actual pixels, while the view's bounds are in points and are not adjusted by the contentScaleFactor to get pixels. Your device undoubtedly has a contentScaleFactor of 2.0, so the output is 1/2 the size in each dimension.
Instead, set the rect as:
var rect = CGRect(x: 0, y: 0, width: glView.drawableWidth,
height: glView.drawableHeight)
drawableWidth and drawableHeight return the dimensions in pixels, accounting for the contentScaleFactor. See:
https://developer.apple.com/reference/glkit/glkview/1615591-drawablewidth
Also, there is no need to set glView's contentScaleFactor.
Which format are you setting before starting the capture? Are you sure that the video preview layer is filling the whole screen?
You have two ways to set the resolution during an AV capture (a sketch of both follows):
Choose the AVCaptureDeviceFormat with the highest resolution by looking through the available capture formats.
Use the sessionPreset property of your capture session. Doc here.
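A minimal sketch of both options in current Swift syntax (my own illustration; the session and device names are placeholders):
import AVFoundation

let session = AVCaptureSession()

// Option 1: pick the highest-resolution format the device offers.
if let device = AVCaptureDevice.default(for: .video) {
    let best = device.formats.max { lhs, rhs in
        CMVideoFormatDescriptionGetDimensions(lhs.formatDescription).width <
            CMVideoFormatDescriptionGetDimensions(rhs.formatDescription).width
    }
    if let best = best, (try? device.lockForConfiguration()) != nil {
        device.activeFormat = best
        device.unlockForConfiguration()
    }
}

// Option 2: use a session preset.
if session.canSetSessionPreset(.hd1920x1080) {
    session.sessionPreset = .hd1920x1080
}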
I am developing an application to detect rectangles in a live camera feed and highlight the detected rectangle. I handled the camera with AVFoundation and used the methods below to detect and highlight the rectangle.
var detector: CIDetector?;
override func viewDidLoad() {
super.viewDidLoad();
detector = self.prepareRectangleDetector();
}
func captureOutput(captureOutput: AVCaptureOutput!, didOutputSampleBuffer sampleBuffer: CMSampleBuffer!, fromConnection connection: AVCaptureConnection!) { // re check this method
// Need to shimmy this through type-hell
let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)
// Force the type change - pass through opaque buffer
let opaqueBuffer = Unmanaged<CVImageBuffer>.passUnretained(imageBuffer!).toOpaque()
let pixelBuffer = Unmanaged<CVPixelBuffer>.fromOpaque(opaqueBuffer).takeUnretainedValue()
let sourceImage = CIImage(CVPixelBuffer: pixelBuffer, options: nil)
// Do some detection on the image
self.performRectangleDetection(sourceImage);
var outputImage = sourceImage
// Do some clipping
var drawFrame = outputImage.extent
let imageAR = drawFrame.width / drawFrame.height
let viewAR = videoDisplayViewBounds.width / videoDisplayViewBounds.height
if imageAR > viewAR {
drawFrame.origin.x += (drawFrame.width - drawFrame.height * viewAR) / 2.0
drawFrame.size.width = drawFrame.height / viewAR
} else {
drawFrame.origin.y += (drawFrame.height - drawFrame.width / viewAR) / 2.0
drawFrame.size.height = drawFrame.width / viewAR
}
//videoDisplayView is a GLKView which is used to display camera feed
videoDisplayView.bindDrawable()
if videoDisplayView.context != EAGLContext.currentContext() {
EAGLContext.setCurrentContext(videoDisplayView.context)
}
// clear eagl view to grey
glClearColor(0.5, 0.5, 0.5, 1.0);
glClear(0x00004000)
// set the blend mode to "source over" so that CI will use that
glEnable(0x0BE2);
glBlendFunc(1, 0x0303);
renderContext.drawImage(outputImage, inRect: videoDisplayViewBounds, fromRect: drawFrame);
videoDisplayView.display();
}
func prepareRectangleDetector() -> CIDetector {
let options: [String: AnyObject] = [CIDetectorAccuracy: CIDetectorAccuracyHigh];
return CIDetector(ofType: CIDetectorTypeRectangle, context: nil, options: options);
}
func performRectangleDetection(image: CIImage){
let resultImage: CIImage? = nil;
if let detector = detector {
// Get the detections
let features = detector.featuresInImage(image, options: [CIDetectorAspectRatio:NSNumber(float:1.43)]);
if features.count != 0{ // feature found
for feature in features as! [CIRectangleFeature] {
self.previewImageView.layer.sublayers = nil;
let line: CAShapeLayer = CAShapeLayer();
line.frame = self.videoDisplayView.bounds;
let linePath: UIBezierPath = UIBezierPath();
linePath.moveToPoint(feature.topLeft);
linePath.addLineToPoint(feature.topRight);
linePath.addLineToPoint(feature.bottomRight);
linePath.addLineToPoint(feature.bottomLeft);
linePath.addLineToPoint(feature.topLeft);
linePath.closePath();
line.lineWidth = 5.0;
line.path = linePath.CGPath;
line.fillColor = UIColor.clearColor().CGColor;
line.strokeColor = UIColor(netHex: 0x3399CC, alpha: 1.0).CGColor;
// videoDisplayParentView is the parent of videoDisplayView and they both have same bounds
self.videoDisplayParentView.layer.addSublayer(line);
}
}
}
}
I used CAShapeLayer and UIBezierPath to draw the rectangle. This is very, very slow: the path only becomes visible after minutes.
Can someone please help me figure out why it is slow, or let me know if I am doing something wrong here? Any help would be highly appreciated.
Or if there is an easier way than this, I would like to know that too.
If you get into the business of adding a sublayer to a GLKView, it will be slow. The GLKView here refreshes multiple times every second (since this runs in the captureOutput:didOutputSampleBuffer: method), and creating and adding a sublayer every time cannot keep up with that.
A better way is to draw the path using Core Image and composite it over resultImage (a sketch follows).
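For example, a rough sketch of that approach in current Swift syntax (my own illustration; the filter name and parameter keys are the standard Core Image ones, and the feature comes from the same CIDetector):
// Build a semi-transparent overlay shaped like the detected rectangle and
// composite it over the camera frame, then draw only the composited image.
func drawHighlightOverlay(on image: CIImage, feature: CIRectangleFeature) -> CIImage {
    var overlay = CIImage(color: CIColor(red: 0.2, green: 0.6, blue: 0.8, alpha: 0.5))
    overlay = overlay.cropped(to: image.extent)
    overlay = overlay.applyingFilter("CIPerspectiveTransformWithExtent", parameters: [
        "inputExtent": CIVector(cgRect: image.extent),
        "inputTopLeft": CIVector(cgPoint: feature.topLeft),
        "inputTopRight": CIVector(cgPoint: feature.topRight),
        "inputBottomLeft": CIVector(cgPoint: feature.bottomLeft),
        "inputBottomRight": CIVector(cgPoint: feature.bottomRight)
    ])
    return overlay.composited(over: image)
}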