My app generates PLY files after scanning objects. After the iOS 14 update the colors of my 3D models no longer load correctly. I am also unable to view PLY files in Xcode (they work fine in Preview).
Does anyone know a workaround for this problem?
I tried reading the PLY file contents and displaying the vertices and faces in a scene geometry myself, but it takes too long to load a file.
Apparently creating an MDLAsset throws some Metal warnings and the mesh color does not show up properly.
Here are sample images from the iOS 13 and iOS 14 previews in SceneKit.
I hit the same problem and found that it is a SceneKit bug. My workaround is to read the .ply file with C and create an SCNGeometry instance from the data. The main code:
1. First, read vertexCount and faceCount from the .ply header (my file is in ASCII format):
bool readFaceAndVertexCount(char* filePath, int *vertexCount, int *faceCount);
example:
bool readFaceAndVertexCount(char* filePath, int *vertexCount, int *faceCount) {
    char data[1024]; // must be at least as large as the fgets limit below
    FILE *fp;
    if ((fp = fopen(filePath, "r")) == NULL) {
        printf("error!");
        return false;
    }
    while (!feof(fp)) {
        fgets(data, sizeof(data), fp);
        unsigned long i = strlen(data);
        data[i - 1] = '\0'; // strip the trailing newline
        if (strstr(data, "element vertex") != NULL) {
            char *res = strtok(data, " ");
            while (res != NULL) {
                res = strtok(NULL, " ");
                if (res != NULL) {
                    *vertexCount = atoi(res);
                }
            }
        }
        if (strstr(data, "element face") != NULL) {
            char *res = strtok(data, " ");
            while (res != NULL) {
                res = strtok(NULL, " ");
                if (res != NULL) {
                    *faceCount = atoi(res);
                }
            }
        }
        if (*faceCount > 0 && *vertexCount > 0) {
            break;
        }
    }
    fclose(fp);
    return true;
}
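Called from Swift (assuming the C functions are exposed through the project's bridging header, and with url being the file URL of your .ply), this might look roughly like:

var vertexCount: Int32 = 0
var faceCount: Int32 = 0
url.path.withCString { cPath in
    _ = readFaceAndVertexCount(UnsafeMutablePointer(mutating: cPath), &vertexCount, &faceCount)
}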
2. Read the data into arrays.
In the .c file:
// you need to implement this for your own files
bool readPlyFile(char* filePath, const int vertexCount, int faceCount, float *vertex, float *color, int *element);
In Swift:
var vertex: [Float] = Array.init(repeating: 0, count: Int(vertexCount) * 3)
var color: [Float] = Array.init(repeating: 0, count: Int(vertexCount) * 3)
var face: [Int32] = Array.init(repeating: 0, count: Int(faceCount) * 3)
readPlyFile(UnsafeMutablePointer<Int8>(mutating: url.path),vertexCount,faceCount,&vertex,&color,&face)
3. Create a custom SCNGeometry:
let positionData = NSData.init(bytes: vertex, length: MemoryLayout<Float>.size * vertex.count)
let vertexSource = SCNGeometrySource.init(data: positionData as Data, semantic: .vertex, vectorCount: Int(vertexCount), usesFloatComponents: true, componentsPerVector: 3, bytesPerComponent: MemoryLayout<Float>.size, dataOffset: 0, dataStride: MemoryLayout<Float>.size * 3)
let colorData = NSData.init(bytes: color, length: MemoryLayout<Float>.size * color.count)
let colorSource = SCNGeometrySource.init(data: colorData as Data, semantic: .color, vectorCount: Int(vertexCount), usesFloatComponents: true, componentsPerVector: 3, bytesPerComponent: MemoryLayout<Float>.size, dataOffset: 0, dataStride: MemoryLayout<Float>.size * 3)
let indexData = NSData(bytes: face, length: MemoryLayout<Int32>.size * face.count)
let element = SCNGeometryElement(data: indexData as Data, primitiveType: SCNGeometryPrimitiveType.triangles, primitiveCount: Int(faceCount), bytesPerIndex: MemoryLayout<Int32>.size)
let geometry = SCNGeometry.init(sources: [vertexSource, colorSource], elements: [element])
let node = SCNNode.init(geometry: geometry)
let scene = SCNScene.init()
node.geometry?.firstMaterial?.cullMode = .back
node.geometry?.firstMaterial?.isDoubleSided = true
scene.rootNode.addChildNode(node)
scnView.scene = scene
It works, and it loads much faster!
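If you would rather skip the C bridge entirely, here is a rough pure-Swift sketch of filling the same arrays from an ASCII .ply. It assumes the common layout of "x y z r g b" per vertex line (colors 0-255) and "3 i0 i1 i2" per face line, so adapt it to whatever your scanner's exporter writes:

import Foundation

func readASCIIPly(at url: URL) throws -> (vertex: [Float], color: [Float], face: [Int32]) {
    let lines = try String(contentsOf: url, encoding: .ascii)
        .split(whereSeparator: \.isNewline)
    var vertexCount = 0, faceCount = 0, bodyStart = 0
    for (i, line) in lines.enumerated() {
        let parts = line.split(separator: " ")
        if parts.count == 3 && parts[0] == "element" && parts[1] == "vertex" {
            vertexCount = Int(parts[2]) ?? 0
        } else if parts.count == 3 && parts[0] == "element" && parts[1] == "face" {
            faceCount = Int(parts[2]) ?? 0
        } else if line == "end_header" {
            bodyStart = i + 1
            break
        }
    }
    var vertex = [Float](), color = [Float](), face = [Int32]()
    for line in lines[bodyStart ..< bodyStart + vertexCount] {
        let v = line.split(separator: " ").compactMap { Float($0) }
        vertex.append(contentsOf: v[0...2])
        color.append(contentsOf: v[3...5].map { $0 / 255.0 }) // assumes uchar colors
    }
    for line in lines[bodyStart + vertexCount ..< bodyStart + vertexCount + faceCount] {
        let f = line.split(separator: " ").compactMap { Int32($0) }
        face.append(contentsOf: f[1...3]) // skip the leading vertex-count token
    }
    return (vertex, color, face)
}

The resulting arrays drop straight into the SCNGeometrySource/SCNGeometryElement code above.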
Related
I've written some simple multisampled rendering in Metal. It's just drawing a single solid colored quad. After rendering I read the contents of the resolve texture. This works on Intel and M1 but fails on AMD and NVidia.
Any idea what I'm doing wrong? Metal's API Validation doesn't complain about anything :(
//
// Renderer.swift
// metaltest
//
import Foundation
import Metal
import MetalKit
class Renderer : NSObject, MTKViewDelegate {
let device: MTLDevice
let commandQueue: MTLCommandQueue
let pipelineState: MTLRenderPipelineState
let vertexBuffer: MTLBuffer
let texture: MTLTexture
let resolveTexture: MTLTexture
let width = 16;
let height = 16;
//let samplerState: MTLSamplerState
var frameCount: Int = 0
// This is the initializer for the Renderer class.
// We will need access to the mtkView later, so we add it as a parameter here.
init?(mtkView: MTKView) {
device = mtkView.device!
mtkView.framebufferOnly = true
commandQueue = device.makeCommandQueue()!
// Create the Render Pipeline
do {
pipelineState = try Renderer.buildRenderPipelineWith(device: device, metalKitView: mtkView)
} catch {
print("Unable to compile render pipeline state: \(error)")
return nil
}
// Create our vertex data
let vertices = [
Vertex(pos: [-1, -1]),
Vertex(pos: [ 1, -1]),
Vertex(pos: [-1, 1]),
Vertex(pos: [-1, 1]),
Vertex(pos: [ 1, -1]),
Vertex(pos: [ 1, 1]),
]
// And copy it to a Metal buffer...
vertexBuffer = device.makeBuffer(bytes: vertices, length: vertices.count * MemoryLayout<Vertex>.stride, options: [])!
print("texture size: width: \(width), height: \(height)")
let textureDescriptor = MTLTextureDescriptor.texture2DDescriptor(
pixelFormat: MTLPixelFormat.rgba8Unorm,
width: width,
height: height,
mipmapped: false)
textureDescriptor.sampleCount = 4
textureDescriptor.usage = [.renderTarget]
textureDescriptor.textureType = .type2DMultisample
textureDescriptor.storageMode = .private
texture = device.makeTexture(descriptor: textureDescriptor)!
let resolveTextureDescriptor = MTLTextureDescriptor.texture2DDescriptor(
pixelFormat: MTLPixelFormat.rgba8Unorm,
width: width,
height: height,
mipmapped: false)
resolveTextureDescriptor.usage = [.renderTarget]
resolveTexture = device.makeTexture(descriptor: resolveTextureDescriptor)!
}
// Create our custom rendering pipeline, which loads shaders using `device`, and outputs to the format of `metalKitView`
class func buildRenderPipelineWith(device: MTLDevice, metalKitView: MTKView) throws -> MTLRenderPipelineState {
// Create a new pipeline descriptor
let pipelineDescriptor = MTLRenderPipelineDescriptor()
// Setup the shaders in the pipeline
let library = device.makeDefaultLibrary()
pipelineDescriptor.vertexFunction = library?.makeFunction(name: "vertexShader")
pipelineDescriptor.fragmentFunction = library?.makeFunction(name: "fragmentShader")
// Setup the output pixel format to match the pixel format of the metal kit view
pipelineDescriptor.colorAttachments[0].pixelFormat = MTLPixelFormat.rgba8Unorm;
pipelineDescriptor.sampleCount = 4;
// Compile the configured pipeline descriptor to a pipeline state object
return try device.makeRenderPipelineState(descriptor: pipelineDescriptor)
}
// mtkView will automatically call this function
// whenever it wants new content to be rendered.
func draw(in view: MTKView) {
guard let commandBuffer = commandQueue.makeCommandBuffer() else { return }
let renderPassDescriptor = MTLRenderPassDescriptor(); // view.currentRenderPassDescriptor else { return }
renderPassDescriptor.colorAttachments[0].texture = texture;
renderPassDescriptor.colorAttachments[0].clearColor = MTLClearColorMake(1, 1, 1, 1)
renderPassDescriptor.colorAttachments[0].resolveTexture = resolveTexture;
renderPassDescriptor.colorAttachments[0].storeAction = .storeAndMultisampleResolve
guard let renderEncoder = commandBuffer.makeRenderCommandEncoder(descriptor: renderPassDescriptor) else { return }
renderEncoder.setRenderPipelineState(pipelineState)
renderEncoder.setVertexBuffer(vertexBuffer, offset: 0, index: 0)
renderEncoder.drawPrimitives(type: .triangle, vertexStart: 0, vertexCount: 6)
renderEncoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let pixelCount = width * height
let region = MTLRegionMake2D(0, 0, width, height)
var pixels = Array<UInt8>(repeating: UInt8(0), count: pixelCount * 4)
resolveTexture.getBytes(
&pixels,
bytesPerRow: width * 4,
from: region,
mipmapLevel: 0);
print("dest size: width: \(width), height: \(height)")
print("Top Left : \(String(format:"%02X", pixels[0])), \(String(format:"%02X", pixels[1])), \(String(format:"%02X", pixels[2])), \(String(format:"%02X", pixels[3])), expected: (0x80, 0x99, 0xB2, 0xCC)")
let offset = width * height * 4 - 4;
print("Bottom Right: \(String(format:"%02X", pixels[offset])), \(String(format:"%02X", pixels[offset + 1])), \(String(format:"%02X", pixels[offset + 2])), \(String(format:"%02X", pixels[offset + 3])), expected: (0x80, 0x99, 0xB2, 0xCC)")
exit(0)
}
// mtkView will automatically call this function
// whenever the size of the view changes (such as resizing the window).
func mtkView(_ view: MTKView, drawableSizeWillChange size: CGSize) {
}
}
Shader
#include <metal_stdlib>
#include "ShaderDefinitions.h"
using namespace metal;
struct VertexOut {
float4 pos [[position]];
};
vertex VertexOut vertexShader(const device Vertex *vertexArray [[buffer(0)]], unsigned int vid [[vertex_id]])
{
Vertex in = vertexArray[vid];
VertexOut out;
out.pos = float4(in.pos.xy, 0, 1);
return out;
}
fragment float4 fragmentShader()
{
return float4(0.5, 0.6, 0.7, 0.8);
}
ShaderDefinitions.h
#ifndef ShaderDefinitions_h
#define ShaderDefinitions_h
#include <simd/simd.h>
struct Vertex {
vector_float2 pos;
};
#endif /* ShaderDefinitions_h */
The output I expect is:
Top Left : 80, 99, B2, CC, expected: (0x80, 0x99, 0xB2, 0xCC)
Bottom Right: 80, 99, B2, CC, expected: (0x80, 0x99, 0xB2, 0xCC)
Which is what I get on Intel and M1 but on AMD and NVidia I get
Top Left : 00, 00, 00, 00, expected: (0x80, 0x99, 0xB2, 0xCC)
Bottom Right: 00, 00, 00, 00, expected: (0x80, 0x99, 0xB2, 0xCC)
[Intel, Apple M1] - unified memory model
[Nvidia, AMD] - discrete memory model
Understand the Private Mode
A resource with a MTLStorageModePrivate mode is accessible only to the
GPU. In a unified memory model, this resource resides in system
memory. In a discrete memory model, it resides in video memory.
Apple's samples show how to copy texture data from a private texture to a shared buffer, which is one way to read it back.
In my case the issue was that I needed to call synchronize so the resolved texture data becomes available to the CPU:
guard let blitEncoder = commandBuffer.makeBlitCommandEncoder() else { return }
blitEncoder.synchronize(texture: resolveTexture, slice: 0, level: 0);
blitEncoder.endEncoding();
Inserting that code before commandBuffer.commit() in the code from the question solved the issue
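If the managed-texture synchronize route is not available (it is macOS-only), another option hinted at above is to blit-copy the resolve texture into a CPU-visible buffer and read that instead. A rough sketch, with readbackBuffer being a name introduced here:

let bytesPerRow = width * 4
let readbackBuffer = device.makeBuffer(length: bytesPerRow * height, options: .storageModeShared)!
guard let blit = commandBuffer.makeBlitCommandEncoder() else { return }
blit.copy(from: resolveTexture,
          sourceSlice: 0,
          sourceLevel: 0,
          sourceOrigin: MTLOrigin(x: 0, y: 0, z: 0),
          sourceSize: MTLSize(width: width, height: height, depth: 1),
          to: readbackBuffer,
          destinationOffset: 0,
          destinationBytesPerRow: bytesPerRow,
          destinationBytesPerImage: bytesPerRow * height)
blit.endEncoding()
// After commandBuffer.waitUntilCompleted(), read the pixels from readbackBuffer.contents().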
I'm trying a very simple algorithm using Metal GPU acceleration to calculate some values in an array. The shader throws an error under certain conditions, which I will explain.
Error: Execution of the command buffer was aborted due to an error during execution. Ignored (for causing prior/excessive GPU errors) (IOAF code 4)
The shader only throws this error when adding a value to the existing value at an index of the array. For example, this will not cause an error:
kernel void shader (device int *wPointsIntensity [[buffer(0)]],
const device uint *wPointsXCoord [[buffer(1)]],
const device uint *wPointsYCoord [[buffer(2)]],
device float *pixelSignalIntensity [[buffer(3)]],
device float *pixelDistance [[buffer(4)]],
const device uint& noOfPoints [[ buffer(5) ]],
const device uint& width [[ buffer(6) ]],
const device uint& height [[ buffer(7) ]],
uint id [[ thread_position_in_grid ]]) {
//this does not throw error
for (uint wpIndex = 0; wpIndex < noOfPoints; wpIndex++) {
for (uint heightIndex = 0; heightIndex < height; heightIndex++) {
for (uint widthIndex = 0; widthIndex < width; widthIndex++) {
uint pixelIndex = heightIndex * width + widthIndex;
pixelDistance[pixelIndex] = float(pixelIndex);
pixelSignalIntensity[pixelIndex] = float(pixelIndex);
}}}}
But if you change
pixelDistance[pixelIndex] = float(pixelIndex);
to
pixelDistance[pixelIndex] += float(pixelIndex);
it will throw the error.
Here is the Swift code:
var wPointsValues = [Int32](repeating:0, count: wPoints.count)
var wPointsXLocations = [Int32](repeating:0, count: wPoints.count)
var wPointsYLocations = [Int32](repeating:0, count: wPoints.count)
for i in 0..<wPoints.count {
wPointsValues[i] = Int32(wPoints[i].signalIntensity)
wPointsXLocations[i] = Int32(wPoints[i].location.x)
wPointsYLocations[i] = Int32(wPoints[i].location.y)
}
var numberOfWPoints:Int32 = Int32(wPoints.count)
var int32Width = Int32(width)
var int32Height = Int32(height)
//output arrays
let numberOfResults = wPoints.count * Int(width) * Int(height)
var wPointsSignalIntensity = [Float32](repeating:0.0, count: numberOfResults)
var wPointsDistance = [Float32](repeating:0.0, count: numberOfResults)
//local variables
var signalDensity:[Float32] = [Float32](repeating:0.0, count: numberOfResults)
var signalDistance:[Float32] = [Float32](repeating:0.0, count: numberOfResults)
//create input buffers
let inWPointSignalValues = device.makeBuffer(bytes: wPointsValues, length: (MemoryLayout<Int32>.stride * wPoints.count), options: [])
let inWPointXCoordBuffer = device.makeBuffer(bytes: wPointsXLocations, length: (MemoryLayout<Int32>.stride * wPoints.count), options: [])
let inWPointYCoordBuffer = device.makeBuffer(bytes: wPointsYLocations, length: (MemoryLayout<Int32>.stride * wPoints.count), options: [])
//create output buffers
let outPixelSignalIntensityBuffer = device.makeBuffer(bytes: wPointsSignalIntensity, length: (MemoryLayout<Float32>.stride * numberOfResults), options: [])
let outPixelDistanceBuffer = device.makeBuffer(bytes: wPointsDistance, length: (MemoryLayout<Float32>.stride * numberOfResults), options: [])
let commandBuffer = (mtlCommmandQueue?.makeCommandBuffer())!
let computeCommandEncoder = (commandBuffer.makeComputeCommandEncoder())!
computeCommandEncoder.setComputePipelineState(mtlComputePipelineFilter!)
//set input buffers
computeCommandEncoder.setBuffer(inWPointSignalValues, offset: 0, index: 0)
computeCommandEncoder.setBuffer(inWPointXCoordBuffer, offset: 0, index: 1)
computeCommandEncoder.setBuffer(inWPointYCoordBuffer, offset: 0, index: 2)
//set output buffers
computeCommandEncoder.setBuffer(outPixelSignalIntensityBuffer, offset: 0, index: 3)
computeCommandEncoder.setBuffer(outPixelDistanceBuffer, offset: 0, index: 4)
//set constants
computeCommandEncoder.setBytes(&numberOfWPoints, length: MemoryLayout<Int32>.stride, index: 5)
computeCommandEncoder.setBytes(&int32Width, length: MemoryLayout<Int32>.stride, index: 6)
computeCommandEncoder.setBytes(&int32Height, length: MemoryLayout<Int32>.stride, index: 7)
let threadsPerGroup = MTLSize(width:2,height:2,depth:2)
let numThreadgroups = MTLSize(width:2, height:2, depth:2)
computeCommandEncoder.dispatchThreadgroups(numThreadgroups, threadsPerThreadgroup: threadsPerGroup)
let endBufferAllocation = mach_absolute_time()
print("time for creating and setting buffert: time: \(Double(endBufferAllocation - start) / Double(NSEC_PER_SEC))")
computeCommandEncoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let allComplete = mach_absolute_time()
self.signalDistance = (outPixelDistanceBuffer?.contents())!
self.signalDensity = (outPixelSignalIntensityBuffer?.contents())!
I had this issue for ages and my program crashed intermittently. It turned out that I was accessing memory in the kernel that had not been allocated by the buffer. In the kernel I was doing a for loop 0..<5 (i.e. outputting 5 values per thread) but had not divided the number of threads by 5.
When it didn't crash it gave the correct answer, and no error was ever thrown except "Execution of the command buffer was aborted due to an error during execution. Caused GPU Hang Error (IOAF code 3)".
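In Swift terms, the fix amounts to sizing the dispatch so that threads times values-per-thread covers exactly the output buffer. A sketch using the names from the question's code (valuesPerThread is whatever your kernel writes per thread, 1 in the simplest case):

let outputCount = Int(int32Width) * Int(int32Height)   // elements the kernel must write
let valuesPerThread = 1                                 // outputs written by each thread
let threadsNeeded = (outputCount + valuesPerThread - 1) / valuesPerThread
let threadsPerGroup = MTLSize(width: mtlComputePipelineFilter!.threadExecutionWidth, height: 1, depth: 1)
let groupCount = (threadsNeeded + threadsPerGroup.width - 1) / threadsPerGroup.width
computeCommandEncoder.dispatchThreadgroups(MTLSize(width: groupCount, height: 1, depth: 1),
                                           threadsPerThreadgroup: threadsPerGroup)

Because the last threadgroup can contain a few surplus threads, the kernel should still compare its thread position against the element count before writing.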
Based on @Kametrixom's answer, I have made a test application for parallel calculation of the sum of an array.
My test application looks like this:
import UIKit
import Metal
class ViewController: UIViewController {
// Data type, has to be the same as in the shader
typealias DataType = CInt
override func viewDidLoad() {
super.viewDidLoad()
let data = (0..<10000000).map{ _ in DataType(200) } // Our data, randomly generated
var start, end : UInt64
var result:DataType = 0
start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
for elem in buffer {
result += elem
}
}
end = mach_absolute_time()
print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParallel4(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParralel(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParallel3(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
}
func sumParralel(data : Array<DataType>) -> DataType {
let count = data.count
let elementsPerSum: Int = Int(sqrt(Double(count)))
let device = MTLCreateSystemDefaultDevice()!
let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
let pipeline = try! device.newComputePipelineStateWithFunction(parsum)
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)
let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later
let queue = device.newCommandQueue()
let cmds = queue.commandBuffer()
let encoder = cmds.computeCommandEncoder()
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
var result : DataType = 0
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
result += elem
}
return result
}
func sumParralel1(data : Array<DataType>) -> UnsafeBufferPointer<DataType> {
let count = data.count
let elementsPerSum: Int = Int(sqrt(Double(count)))
let device = MTLCreateSystemDefaultDevice()!
let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
let pipeline = try! device.newComputePipelineStateWithFunction(parsum)
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)
let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later
let queue = device.newCommandQueue()
let cmds = queue.commandBuffer()
let encoder = cmds.computeCommandEncoder()
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
cmds.commit()
cmds.waitUntilCompleted()
return results
}
func sumParallel3(data : Array<DataType>) -> DataType {
var results = sumParralel1(data)
repeat {
results = sumParralel1(Array(results))
} while results.count >= 100
var result : DataType = 0
for elem in results {
result += elem
}
return result
}
func sumParallel4(data : Array<DataType>) -> DataType {
let queue = NSOperationQueue()
queue.maxConcurrentOperationCount = 4
var a0 : DataType = 0
var a1 : DataType = 0
var a2 : DataType = 0
var a3 : DataType = 0
let op0 = NSBlockOperation( block : {
for i in 0..<(data.count/4) {
a0 = a0 + data[i]
}
})
let op1 = NSBlockOperation( block : {
for i in (data.count/4)..<(data.count/2) {
a1 = a1 + data[i]
}
})
let op2 = NSBlockOperation( block : {
for i in (data.count/2)..<(3 * data.count/4) {
a2 = a2 + data[i]
}
})
let op3 = NSBlockOperation( block : {
for i in (3 * data.count/4)..<(data.count) {
a3 = a3 + data[i]
}
})
queue.addOperation(op0)
queue.addOperation(op1)
queue.addOperation(op2)
queue.addOperation(op3)
queue.suspended = false
queue.waitUntilAllOperationsAreFinished()
let aaa: DataType = a0 + a1 + a2 + a3
return aaa
}
}
And I have a shader that looks like this:
kernel void parsum(const device DataType* data [[ buffer(0) ]],
const device uint& dataLength [[ buffer(1) ]],
device DataType* sums [[ buffer(2) ]],
const device uint& elementsPerSum [[ buffer(3) ]],
const uint tgPos [[ threadgroup_position_in_grid ]],
const uint tPerTg [[ threads_per_threadgroup ]],
const uint tPos [[ thread_position_in_threadgroup ]]) {
uint resultIndex = tgPos * tPerTg + tPos; // This is the index of the individual result, this var is unique to this thread
uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end
for (; dataIndex < endIndex; dataIndex++)
sums[resultIndex] += data[dataIndex];
}
To my surprise, sumParallel4 is the fastest, which I did not expect. I also noticed that whichever of sumParralel and sumParallel3 I call first is always the slower one, even if I change the order of the calls.
Why is this? Why is sumParallel3 not a lot faster than sumParralel? Why is sumParallel4 the fastest, even though it is calculated on the CPU?
How can I update my GPU function to use posix_memalign? I know it should be faster because it would use memory shared between the GPU and CPU, but I don't know which array should be allocated this way (data or results), and how I can allocate data with posix_memalign when data is a parameter passed into the function.
In running these tests on an iPhone 6, I saw the Metal version run between 3x slower and 2x faster than the naive CPU summation. With the modifications I describe below, it was consistently faster.
I found that a lot of the cost in running the Metal version could be attributed not merely to the allocation of the buffers, though that was significant, but also to the first-time creation of the device and compute pipeline state. These are actions you'd normally perform once at application initialization, so it's not entirely fair to include them in the timing.
It should also be noted that if you're running these tests through Xcode with the Metal validation layer and GPU frame capture enabled, that has a significant run-time cost and will skew the results in the CPU's favor.
With those caveats, here's how you might use posix_memalign to allocate memory that can be used to back a MTLBuffer. The trick is to ensure that the memory you request is in fact page-aligned (i.e. its address is a multiple of getpagesize()), which may entail rounding up the amount of memory beyond how much you actually need to store your data:
let dataCount = 1_000_000
let dataSize = dataCount * strideof(DataType)
let pageSize = Int(getpagesize())
let pageCount = (dataSize + (pageSize - 1)) / pageSize
var dataPointer: UnsafeMutablePointer<Void> = nil
posix_memalign(&dataPointer, pageSize, pageCount * pageSize)
let data = UnsafeMutableBufferPointer(start: UnsafeMutablePointer<DataType>(dataPointer),
count: (pageCount * pageSize) / strideof(DataType))
for i in 0..<dataCount {
data[i] = 200
}
This does require making data an UnsafeMutableBufferPointer<DataType>, rather than a [DataType], since Swift's Array allocates its own backing store. You'll also need to pass along the count of data items to operate on, since the count of the mutable buffer pointer has been rounded up to make the buffer page-aligned.
To actually create a MTLBuffer backed with this data, use the newBufferWithBytesNoCopy(_:length:options:deallocator:) API. It's crucial that, once again, the length you provide is a multiple of the page size; otherwise this method returns nil:
let roundedUpDataSize = strideof(DataType) * data.count
let dataBuffer = device.newBufferWithBytesNoCopy(data.baseAddress, length: roundedUpDataSize, options: [], deallocator: nil)
Here, we don't provide a deallocator, but you should free the memory when you're done using it, by passing the baseAddress of the buffer pointer to free().
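For readers on current Swift, the same allocation might look roughly like this (API names updated to Swift 3+, with DataType assumed to be Int32 as in the question):

import Foundation
import Metal

let dataCount = 1_000_000
let dataSize = dataCount * MemoryLayout<Int32>.stride
let pageSize = Int(getpagesize())
let allocationSize = ((dataSize + pageSize - 1) / pageSize) * pageSize   // round up to a page multiple

var rawPointer: UnsafeMutableRawPointer? = nil
posix_memalign(&rawPointer, pageSize, allocationSize)

// Bind the page-aligned allocation to Int32 and fill it
let dataPointer = rawPointer!.bindMemory(to: Int32.self, capacity: allocationSize / MemoryLayout<Int32>.stride)
for i in 0..<dataCount { dataPointer[i] = 200 }

// The length must be a page multiple, or makeBuffer(bytesNoCopy:) returns nil
let device = MTLCreateSystemDefaultDevice()!
let dataBuffer = device.makeBuffer(bytesNoCopy: rawPointer!,
                                   length: allocationSize,
                                   options: .storageModeShared,
                                   deallocator: { pointer, _ in free(pointer) })

Here the deallocator closure frees the page-aligned allocation when the buffer is released, which takes the place of the manual free() mentioned above.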
I am trying to add a WAV header on top of raw PCM data to make it playable via AVAudioPlayer. But I couldn't find any solution or source code to do that on iOS using Objective-C/Swift. I did find this question, but it doesn't have a correct answer.
I also found a piece of code here which is in C, but it has some issues: the WAV file generated from that code doesn't play properly.
Below is the code I have written so far.
int NumChannels = AUDIO_CHANNELS_PER_FRAME;
short BitsPerSample = AUDIO_BITS_PER_CHANNEL;
int SamplingRate = AUDIO_SAMPLE_RATE;
int numOfSamples = [[NSData dataWithContentsOfFile:filePath] length];
int ByteRate = NumChannels*BitsPerSample*SamplingRate/8;
short BlockAlign = NumChannels*BitsPerSample/8;
int DataSize = NumChannels*numOfSamples*BitsPerSample/8;
int chunkSize = 16;
int totalSize = 36 + DataSize;
short audioFormat = 1;
if((fout = fopen([wavFilePath cStringUsingEncoding:1], "w")) == NULL)
{
printf("Error opening out file ");
}
fwrite("RIFF", sizeof(char), 4,fout);
fwrite(&totalSize, sizeof(int), 1, fout);
fwrite("WAVE", sizeof(char), 4, fout);
fwrite("fmt ", sizeof(char), 3, fout);
fwrite(&chunkSize, sizeof(int),1,fout);
fwrite(&audioFormat, sizeof(short), 1, fout);
fwrite(&NumChannels, sizeof(short),1,fout);
fwrite(&SamplingRate, sizeof(int), 1, fout);
fwrite(&ByteRate, sizeof(int), 1, fout);
fwrite(&BlockAlign, sizeof(short), 1, fout);
fwrite(&BitsPerSample, sizeof(short), 1, fout);
fwrite("data", sizeof(char), 3, fout);
fwrite(&DataSize, sizeof(int), 1, fout);
The file plays too fast, the sound is distorted and only the first 10 to 20 (or so) seconds play. I think the WAV header isn't being generated correctly (because I am able to play the same PCM data/buffer using AudioUnit/AudioQueue). So what am I missing in my code? Any help would be highly appreciated.
Thanks in advance.
OK, I am answering my own question in case it helps someone else. After a few days of tireless trying, I finally got it working. Below is a complete function written in Objective-C and C. It takes as a parameter a file path containing raw PCM data captured directly from the microphone, and returns a file path to a file containing that PCM data preceded by the appropriate WAV header. You can then play that file with AVAudioPlayer or AVPlayer. Here is the code...
- (NSURL *) getAndCreatePlayableFileFromPcmData:(NSString *)filePath
{
NSString *wavFileName = [[filePath lastPathComponent] stringByDeletingPathExtension];
NSString *wavFileFullName = [NSString stringWithFormat:@"%@.wav",wavFileName];
[self createFileWithName:wavFileFullName];
NSArray *dirPaths = NSSearchPathForDirectoriesInDomains(NSDocumentDirectory, NSUserDomainMask, YES);
NSString *docsDir = [dirPaths objectAtIndex:0];
NSString *wavFilePath = [docsDir stringByAppendingPathComponent:wavFileFullName];
NSLog(#"PCM file path : %#",filePath);
FILE *fout;
short NumChannels = AUDIO_CHANNELS_PER_FRAME;
short BitsPerSample = AUDIO_BITS_PER_CHANNEL;
int SamplingRate = AUDIO_SAMPLE_RATE;
int numOfSamples = [[NSData dataWithContentsOfFile:filePath] length];
int ByteRate = NumChannels*BitsPerSample*SamplingRate/8;
short BlockAlign = NumChannels*BitsPerSample/8;
int DataSize = NumChannels*numOfSamples*BitsPerSample/8;
int chunkSize = 16;
int totalSize = 46 + DataSize;
short audioFormat = 1;
if((fout = fopen([wavFilePath cStringUsingEncoding:1], "w")) == NULL)
{
printf("Error opening out file ");
}
fwrite("RIFF", sizeof(char), 4,fout);
fwrite(&totalSize, sizeof(int), 1, fout);
fwrite("WAVE", sizeof(char), 4, fout);
fwrite("fmt ", sizeof(char), 4, fout);
fwrite(&chunkSize, sizeof(int),1,fout);
fwrite(&audioFormat, sizeof(short), 1, fout);
fwrite(&NumChannels, sizeof(short),1,fout);
fwrite(&SamplingRate, sizeof(int), 1, fout);
fwrite(&ByteRate, sizeof(int), 1, fout);
fwrite(&BlockAlign, sizeof(short), 1, fout);
fwrite(&BitsPerSample, sizeof(short), 1, fout);
fwrite("data", sizeof(char), 4, fout);
fwrite(&DataSize, sizeof(int), 1, fout);
fclose(fout);
NSMutableData *pamdata = [NSMutableData dataWithContentsOfFile:filePath];
NSFileHandle *handle;
handle = [NSFileHandle fileHandleForUpdatingAtPath:wavFilePath];
[handle seekToEndOfFile];
[handle writeData:pamdata];
[handle closeFile];
return [NSURL fileURLWithPath:wavFilePath];
}
But that function only works with the following audio settings.
// Audio settings.
#define AUDIO_SAMPLE_RATE 8000
#define AUDIO_FRAMES_PER_PACKET 1
#define AUDIO_CHANNELS_PER_FRAME 1
#define AUDIO_BITS_PER_CHANNEL 16
#define AUDIO_BYTES_PER_PACKET 2
#define AUDIO_BYTES_PER_FRAME 2
Very helpful question and answer, thank you very much.
Here is a Swift version for those in need:
static func createWAV(from pcmFilePath: String, to wavFilePath: String) -> Bool {
// Make sure that the path does not contain non-ascii characters
guard let fout = fopen(wavFilePath.cString(using: .ascii), "w") else { return false }
guard let pcmData = try? Data(contentsOf: URL(fileURLWithPath: pcmFilePath)) else { return false }
var numChannels: CShort = 1
let numChannelsInt: CInt = 1
var bitsPerSample: CShort = 16
let bitsPerSampleInt: CInt = 16
var samplingRate: CInt = 16000
let numOfSamples = CInt(pcmData.count)
var byteRate = numChannelsInt * bitsPerSampleInt * samplingRate / 8
var blockAlign = numChannelsInt * bitsPerSampleInt / 8
var dataSize = numChannelsInt * numOfSamples * bitsPerSampleInt / 8
var chunkSize: CInt = 16
var totalSize = 46 + dataSize
var audioFormat: CShort = 1
fwrite("RIFF".cString(using: .ascii), MemoryLayout<CChar>.size, 4, fout)
fwrite(&totalSize, MemoryLayout<CInt>.size, 1, fout)
fwrite("WAVE".cString(using: .ascii), MemoryLayout<CChar>.size, 4, fout);
fwrite("fmt ".cString(using: .ascii), MemoryLayout<CChar>.size, 4, fout);
fwrite(&chunkSize, MemoryLayout<CInt>.size,1,fout);
fwrite(&audioFormat, MemoryLayout<CShort>.size, 1, fout);
fwrite(&numChannels, MemoryLayout<CShort>.size,1,fout);
fwrite(&samplingRate, MemoryLayout<CInt>.size, 1, fout);
fwrite(&byteRate, MemoryLayout<CInt>.size, 1, fout);
fwrite(&blockAlign, MemoryLayout<CShort>.size, 1, fout);
fwrite(&bitsPerSample, MemoryLayout<CShort>.size, 1, fout);
fwrite("data".cString(using: .ascii), MemoryLayout<CChar>.size, 4, fout);
fwrite(&dataSize, MemoryLayout<CInt>.size, 1, fout);
fclose(fout);
guard let handle = FileHandle(forUpdatingAtPath: wavFilePath) else { return false }
handle.seekToEndOfFile()
handle.write(pcmData)
handle.closeFile()
return true
}
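A possible call site, assuming the static method lives on a helper type (here called AudioFileConverter, a name introduced for this example) and that the PCM was captured with the 16 kHz mono 16-bit settings the header hard-codes:

import AVFoundation

// Paths are placeholders; point them at your actual raw PCM capture.
let pcmPath = NSTemporaryDirectory() + "capture.pcm"
let wavPath = NSTemporaryDirectory() + "capture.wav"

if AudioFileConverter.createWAV(from: pcmPath, to: wavPath) {
    let player = try? AVAudioPlayer(contentsOf: URL(fileURLWithPath: wavPath))
    player?.play()
}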
Modified from qiz's answer, for Swift 5:
func extractSubchunks(data:Data) -> RiffFile? {
var data = data
var chunks = [SubChunk]()
let position = data.subdata(in: 8..<12)
let filelengthBytes = data.subdata(in: 4..<8).map { UInt32($0) }
// RIFF sizes are stored little-endian, so the low byte comes first
let filelength: UInt32 = filelengthBytes[3] << 24 + filelengthBytes[2] << 16 + filelengthBytes[1] << 8 + filelengthBytes[0]
let wave = String(bytes: position, encoding: .utf8) ?? "NoName"
guard wave == "WAVE" else {
print("File is \(wave) not WAVE")
return nil
}
data.removeSubrange(0..<12)
print("Found chunks")
while data.count != 0{
let position = data.subdata(in: 0..<4)
let lengthBytes = data.subdata(in: 4..<8).map { UInt32($0) }
// chunk sizes are little-endian as well
let length: UInt32 = lengthBytes[3] << 24 + lengthBytes[2] << 16 + lengthBytes[1] << 8 + lengthBytes[0]
guard let current = String(bytes: position, encoding: .utf8) else{
return nil
}
data.removeSubrange(0..<8)
let chunkData = data.subdata(in: 0..<Int(length))
data.removeSubrange(0..<Int(length))
let subchunk = SubChunk(name: current, size: Int(length), data: chunkData)
chunks.append(subchunk)
print(subchunk.debugDescription)
}
let riff = RiffFile(size: Int(filelength), subChunks: chunks)
return riff
}
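The snippet assumes SubChunk and RiffFile types along these lines (a sketch; the original answer's definitions may differ):

import Foundation

struct SubChunk: CustomDebugStringConvertible {
    let name: String   // four-character chunk ID, e.g. "fmt " or "data"
    let size: Int      // chunk payload size in bytes
    let data: Data     // chunk payload

    var debugDescription: String {
        return "\(name) chunk, \(size) bytes"
    }
}

struct RiffFile {
    let size: Int               // RIFF chunk size taken from the file header
    let subChunks: [SubChunk]   // every chunk following the "WAVE" identifier
}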
This is a Swift Data extension that returns the same bytes wrapped as WAV data, based on qiz's answer.
extension Data {
var wavValue: Data? {
var numChannels: CShort = 1
let numChannelsInt: CInt = 1
var bitsPerSample: CShort = 16
let bitsPerSampleInt: CInt = 16
var samplingRate: CInt = 44100
let numOfSamples = CInt(self.count) // self holds the raw PCM payload
var byteRate = numChannelsInt * bitsPerSampleInt * samplingRate / 8
var blockAlign = numChannelsInt * bitsPerSampleInt / 8
var dataSize = numChannelsInt * numOfSamples * bitsPerSampleInt / 8
var chunkSize: CInt = 16
var totalSize = 46 + dataSize
var audioFormat: CShort = 1
let wavNSData = NSMutableData()
wavNSData.append("RIFF".cString(using: .ascii) ?? .init(), length: MemoryLayout<CChar>.size * 4)
wavNSData.append(&totalSize, length: MemoryLayout<CInt>.size)
wavNSData.append("WAVE".cString(using: .ascii) ?? .init(), length: MemoryLayout<CChar>.size * 4)
wavNSData.append("fmt ".cString(using: .ascii) ?? .init(), length: MemoryLayout<CChar>.size * 4)
wavNSData.append(&chunkSize, length: MemoryLayout<CInt>.size)
wavNSData.append(&audioFormat, length: MemoryLayout<CShort>.size)
wavNSData.append(&numChannels, length: MemoryLayout<CShort>.size)
wavNSData.append(&samplingRate, length: MemoryLayout<CInt>.size)
wavNSData.append(&byteRate, length: MemoryLayout<CInt>.size)
wavNSData.append(&blockAlign, length: MemoryLayout<CShort>.size)
wavNSData.append(&bitsPerSample, length: MemoryLayout<CShort>.size)
wavNSData.append("data".cString(using: .ascii) ?? .init(), length: MemoryLayout<CChar>.size * 4)
wavNSData.append(&dataSize, length: MemoryLayout<CInt>.size)
wavNSData.append(self)
let wavData = Data(referencing: wavNSData)
return wavData
}
}
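A possible use of the extension (the input path is a placeholder, and the PCM must already match the 44.1 kHz mono 16-bit values hard-coded above):

import AVFoundation

if let pcm = try? Data(contentsOf: URL(fileURLWithPath: "/path/to/raw.pcm")),
   let wav = pcm.wavValue {
    let player = try? AVAudioPlayer(data: wav)
    player?.play()
}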
I am trying to get the CIColorCube filter working. However, the Apple documentation only provides a poorly explained reference example here:
// Allocate memory
const unsigned int size = 64;
float *cubeData = (float *)malloc (size * size * size * sizeof (float) * 4);
float rgb[3], hsv[3], *c = cubeData;
// Populate cube with a simple gradient going from 0 to 1
for (int z = 0; z < size; z++){
rgb[2] = ((double)z)/(size-1); // Blue value
for (int y = 0; y < size; y++){
rgb[1] = ((double)y)/(size-1); // Green value
for (int x = 0; x < size; x ++){
rgb[0] = ((double)x)/(size-1); // Red value
// Convert RGB to HSV
// You can find publicly available rgbToHSV functions on the Internet
rgbToHSV(rgb, hsv);
// Use the hue value to determine which to make transparent
// The minimum and maximum hue angle depends on
// the color you want to remove
float alpha = (hsv[0] > minHueAngle && hsv[0] < maxHueAngle) ? 0.0f: 1.0f;
// Calculate premultiplied alpha values for the cube
c[0] = rgb[0] * alpha;
c[1] = rgb[1] * alpha;
c[2] = rgb[2] * alpha;
c[3] = alpha;
c += 4; // advance our pointer into memory for the next color value
}
}
}
// Create memory with the cube data
NSData *data = [NSData dataWithBytesNoCopy:cubeData
length:cubeDataSize
freeWhenDone:YES];
CIFilter *colorCube = [CIFilter filterWithName:@"CIColorCube"];
[colorCube setValue:@(size) forKey:@"inputCubeDimension"];
// Set data for cube
[colorCube setValue:data forKey:@"inputCubeData"];
So I have attempted to translate this over to Swift with the following:
var filter = CIFilter(name: "CIColorCube")
filter.setValue(ciImage, forKey: kCIInputImageKey)
filter.setDefaults()
var size: UInt = 64
var floatSize = UInt(sizeof(Float))
var cubeDataSize:size_t = size * size * size * floatSize * 4
var colorCubeData:Array<Float> = [
0,0,0,1,
0,0,0,1,
0,0,0,1,
0,0,0,1,
0,0,0,1,
0,0,0,1,
0,0,0,1,
0,0,0,1
]
var cubeData:NSData = NSData(bytesNoCopy: colorCubeData, length: cubeDataSize)
However I get an error when trying to create the cube data:
"Extra argument 'bytesNoCopy' in call"
Basically I am creating the cubeData wrong. Can you advise me on how to properly create the cubeData object in Swift?
Thanks!
Looks like you are after the chroma key filter recipe described here. Here's some code that works. You get a filter for the color you want to make transparent, described by its hue angle:
func RGBtoHSV(r : Float, g : Float, b : Float) -> (h : Float, s : Float, v : Float) {
var h : CGFloat = 0
var s : CGFloat = 0
var v : CGFloat = 0
let col = UIColor(red: CGFloat(r), green: CGFloat(g), blue: CGFloat(b), alpha: 1.0)
col.getHue(&h, saturation: &s, brightness: &v, alpha: nil)
return (Float(h), Float(s), Float(v))
}
func colorCubeFilterForChromaKey(hueAngle: Float) -> CIFilter {
let hueRange: Float = 60 // degrees size pie shape that we want to replace
let minHueAngle: Float = (hueAngle - hueRange/2.0) / 360
let maxHueAngle: Float = (hueAngle + hueRange/2.0) / 360
let size = 64
var cubeData = [Float](repeating: 0, count: size * size * size * 4)
var rgb: [Float] = [0, 0, 0]
var hsv: (h : Float, s : Float, v : Float)
var offset = 0
for z in 0 ..< size {
rgb[2] = Float(z) / Float(size) // blue value
for y in 0 ..< size {
rgb[1] = Float(y) / Float(size) // green value
for x in 0 ..< size {
rgb[0] = Float(x) / Float(size) // red value
hsv = RGBtoHSV(r: rgb[0], g: rgb[1], b: rgb[2])
// the condition checking hsv.s may need to be removed for your use-case
let alpha: Float = (hsv.h > minHueAngle && hsv.h < maxHueAngle && hsv.s > 0.5) ? 0 : 1.0
cubeData[offset] = rgb[0] * alpha
cubeData[offset + 1] = rgb[1] * alpha
cubeData[offset + 2] = rgb[2] * alpha
cubeData[offset + 3] = alpha
offset += 4
}
}
}
let b = cubeData.withUnsafeBufferPointer { Data(buffer: $0) }
let data = b as NSData
let colorCube = CIFilter(name: "CIColorCube", withInputParameters: [
"inputCubeDimension": size,
"inputCubeData": data
])
return colorCube!
}
Then to get your filter call
let chromaKeyFilter = colorCubeFilterForChromaKey(hueAngle: 120)
I used 120 for your standard green screen.
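Applying it might look roughly like this (foreground and background are assumed CIImages you already have, and the output is rendered through a CIContext):

chromaKeyFilter.setValue(foreground, forKey: kCIInputImageKey)

let compositor = CIFilter(name: "CISourceOverCompositing")!
compositor.setValue(chromaKeyFilter.outputImage, forKey: kCIInputImageKey)
compositor.setValue(background, forKey: kCIInputBackgroundImageKey)

if let output = compositor.outputImage {
    let context = CIContext()
    if let cgImage = context.createCGImage(output, from: output.extent) {
        // wrap cgImage in a UIImage/NSImage or draw it as needed
    }
}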
I believe you want to use NSData(bytes: UnsafePointer<Void>, length: Int) instead of NSData(bytesNoCopy: UnsafeMutablePointer<Void>, length: Int). Make that change and calculate the length in the following way and you should be up and running.
let colorCubeData: [Float] = [
0, 0, 0, 1,
1, 0, 0, 1,
0, 1, 0, 1,
1, 1, 0, 1,
0, 0, 1, 1,
1, 0, 1, 1,
0, 1, 1, 1,
1, 1, 1, 1
]
let cubeData = NSData(bytes: colorCubeData, length: colorCubeData.count * sizeof(Float))
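From there, handing the data to the filter is just a matter of setting the cube values (a sketch; the dimension is 2 because the table above holds 2 x 2 x 2 RGBA entries, and ciImage stands in for whatever input image you are filtering):

let colorCube = CIFilter(name: "CIColorCube")!
colorCube.setValue(2, forKey: "inputCubeDimension")
colorCube.setValue(cubeData, forKey: "inputCubeData")
colorCube.setValue(ciImage, forKey: kCIInputImageKey)
let outputImage = colorCube.outputImage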