Swift metal parallel sum calculation of array on iOS - ios

Based on #Kametrixom answer, I have made some test application for parallel calculation of sum in an array.
My test application looks like this:
import UIKit
import Metal
class ViewController: UIViewController {
// Data type, has to be the same as in the shader
typealias DataType = CInt
override func viewDidLoad() {
super.viewDidLoad()
let data = (0..<10000000).map{ _ in DataType(200) } // Our data, randomly generated
var start, end : UInt64
var result:DataType = 0
start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
for elem in buffer {
result += elem
}
}
end = mach_absolute_time()
print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParallel4(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParralel(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParallel3(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
}
func sumParralel(data : Array<DataType>) -> DataType {
let count = data.count
let elementsPerSum: Int = Int(sqrt(Double(count)))
let device = MTLCreateSystemDefaultDevice()!
let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
let pipeline = try! device.newComputePipelineStateWithFunction(parsum)
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)
let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later
let queue = device.newCommandQueue()
let cmds = queue.commandBuffer()
let encoder = cmds.computeCommandEncoder()
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
var result : DataType = 0
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
result += elem
}
return result
}
func sumParralel1(data : Array<DataType>) -> UnsafeBufferPointer<DataType> {
let count = data.count
let elementsPerSum: Int = Int(sqrt(Double(count)))
let device = MTLCreateSystemDefaultDevice()!
let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
let pipeline = try! device.newComputePipelineStateWithFunction(parsum)
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)
let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later
let queue = device.newCommandQueue()
let cmds = queue.commandBuffer()
let encoder = cmds.computeCommandEncoder()
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
cmds.commit()
cmds.waitUntilCompleted()
return results
}
func sumParallel3(data : Array<DataType>) -> DataType {
var results = sumParralel1(data)
repeat {
results = sumParralel1(Array(results))
} while results.count >= 100
var result : DataType = 0
for elem in results {
result += elem
}
return result
}
func sumParallel4(data : Array<DataType>) -> DataType {
let queue = NSOperationQueue()
queue.maxConcurrentOperationCount = 4
var a0 : DataType = 0
var a1 : DataType = 0
var a2 : DataType = 0
var a3 : DataType = 0
let op0 = NSBlockOperation( block : {
for i in 0..<(data.count/4) {
a0 = a0 + data[i]
}
})
let op1 = NSBlockOperation( block : {
for i in (data.count/4)..<(data.count/2) {
a1 = a1 + data[i]
}
})
let op2 = NSBlockOperation( block : {
for i in (data.count/2)..<(3 * data.count/4) {
a2 = a2 + data[i]
}
})
let op3 = NSBlockOperation( block : {
for i in (3 * data.count/4)..<(data.count) {
a3 = a3 + data[i]
}
})
queue.addOperation(op0)
queue.addOperation(op1)
queue.addOperation(op2)
queue.addOperation(op3)
queue.suspended = false
queue.waitUntilAllOperationsAreFinished()
let aaa: DataType = a0 + a1 + a2 + a3
return aaa
}
}
And I have a shader that looks like this:
kernel void parsum(const device DataType* data [[ buffer(0) ]],
const device uint& dataLength [[ buffer(1) ]],
device DataType* sums [[ buffer(2) ]],
const device uint& elementsPerSum [[ buffer(3) ]],
const uint tgPos [[ threadgroup_position_in_grid ]],
const uint tPerTg [[ threads_per_threadgroup ]],
const uint tPos [[ thread_position_in_threadgroup ]]) {
uint resultIndex = tgPos * tPerTg + tPos; // This is the index of the individual result, this var is unique to this thread
uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end
for (; dataIndex < endIndex; dataIndex++)
sums[resultIndex] += data[dataIndex];
}
On my surprise function sumParallel4 is the fastest, which I thought it shouldn't be. I noticed that when I call functions sumParralel and sumParallel3, the first function is always slower even if I change the order of function. (So if I call sumParralel first this is slower, if I call sumParallel3 this is slower.).
Why is this? Why is sumParallel3 not a lot faster than sumParallel ? Why is sumParallel4 the fastest, although it is calculated on CPU?
How can I update my GPU function with posix_memalign ? I know it should work faster because it would have shared memory between GPU and CPU, but I don't know witch array should be allocated this way (data or result) and how can I allocate data with posix_memalign if data is parameter passed in function?

In running these tests on an iPhone 6, I saw the Metal version run between 3x slower and 2x faster than the naive CPU summation. With the modifications I describe below, it was consistently faster.
I found that a lot of the cost in running the Metal version could be attributed not merely to the allocation of the buffers, though that was significant, but also to the first-time creation of the device and compute pipeline state. These are actions you'd normally perform once at application initialization, so it's not entirely fair to include them in the timing.
It should also be noted that if you're running these tests through Xcode with the Metal validation layer and GPU frame capture enabled, that has a significant run-time cost and will skew the results in the CPU's favor.
With those caveats, here's how you might use posix_memalign to allocate memory that can be used to back a MTLBuffer. The trick is to ensure that the memory you request is in fact page-aligned (i.e. its address is a multiple of getpagesize()), which may entail rounding up the amount of memory beyond how much you actually need to store your data:
let dataCount = 1_000_000
let dataSize = dataCount * strideof(DataType)
let pageSize = Int(getpagesize())
let pageCount = (dataSize + (pageSize - 1)) / pageSize
var dataPointer: UnsafeMutablePointer<Void> = nil
posix_memalign(&dataPointer, pageSize, pageCount * pageSize)
let data = UnsafeMutableBufferPointer(start: UnsafeMutablePointer<DataType>(dataPointer),
count: (pageCount * pageSize) / strideof(DataType))
for i in 0..<dataCount {
data[i] = 200
}
This does require making data an UnsafeMutableBufferPointer<DataType>, rather than an [DataType], since Swift's Array allocates its own backing store. You'll also need to pass along the count of data items to operate on, since the count of the mutable buffer pointer has been rounded up to make the buffer page-aligned.
To actually create a MTLBuffer backed with this data, use the newBufferWithBytesNoCopy(_:length:options:deallocator:) API. It's crucial that, once again, the length you provide is a multiple of the page size; otherwise this method returns nil:
let roundedUpDataSize = strideof(DataType) * data.count
let dataBuffer = device.newBufferWithBytesNoCopy(data.baseAddress, length: roundedUpDataSize, options: [], deallocator: nil)
Here, we don't provide a deallocator, but you should free the memory when you're done using it, by passing the baseAddress of the buffer pointer to free().

Related

How to generate very sharp color scale for below zero and above zero?

I'm encountering a big problem when using the number 0 (zero) as a factor for the colors to generate scales, the numbers close to 0 (zero) end up becoming almost white, impossible to see a difference.
The idea is that above 0 (zero) it starts green and gets even stronger and below 0 (zero) starting with a red one and getting stronger.
I really need any number, even if it's 0.000001 already has a visible green and the -0.000001 has a visible red.
Link to SpreadSheet:
https://docs.google.com/spreadsheets/d/1uN5rDEeR10m3EFw29vM_nVXGMqhLcNilYrFOQfcC97s/edit?usp=sharing
Note to help with image translation and visualization:
Número = Number
Nenhum = None
Valor Máx. = Max Value
Valor Min. = Min Value
Current Result / Expected Result
After reading your new comments I understand that these are the requisites:
The values above zero should be green (with increased intensity the further beyond zero).
The values below zero should be red (with increased intensity the further beyond zero).
Values near zero should be coloured (not almost white).
Given those requisites, I developed an Apps Script project that would be useful in your scenario. This is the full project:
function onOpen() {
var ui = SpreadsheetApp.getUi();
ui.createMenu("Extra").addItem("Generate gradient", "parseData").addToUi();
}
function parseData() {
var darkestGreen = "#009000";
var lighestGreen = "#B8F4B8";
var darkestRed = "#893F45";
var lighestRed = "#FEBFC4";
var range = SpreadsheetApp.getActiveRange();
var data = range.getValues();
var biggestPositive = Math.max.apply(null, data);
var biggestNegative = Math.min.apply(null, data);
var greenPalette = colourPalette(darkestGreen, lighestGreen, biggestPositive);
var redPalette = colourPalette(darkestRed, lighestRed, Math.abs(
biggestNegative) + 1);
var fullPalette = [];
for (var i = 0; i < data.length; i++) {
if (data[i] > 0) {
var cellColour = [];
cellColour[0] = greenPalette[data[i] - 1];
fullPalette.push(cellColour);
} else if (data[i] < 0) {
var cellColour = [];
cellColour[0] = redPalette[Math.abs(data[i]) - 1];
fullPalette.push(cellColour);
} else if (data[i] == 0) {
var cellColour = [];
cellColour[0] = null;
fullPalette.push(cellColour);
}
}
range.setBackgrounds(fullPalette);
}
function colourPalette(darkestColour, lightestColour, colourSteps) {
var firstColour = hexToRGB(darkestColour);
var lastColour = hexToRGB(lightestColour);
var blending = 0.0;
var gradientColours = [];
for (i = 0; i < colourSteps; i++) {
var colour = [];
blending += (1.0 / colourSteps);
colour[0] = firstColour[0] * blending + (1 - blending) * lastColour[0];
colour[1] = firstColour[1] * blending + (1 - blending) * lastColour[1];
colour[2] = firstColour[2] * blending + (1 - blending) * lastColour[2];
gradientColours.push(rgbToHex(colour));
}
return gradientColours;
}
function hexToRGB(hex) {
var colour = [];
colour[0] = parseInt((removeNumeralSymbol(hex)).substring(0, 2), 16);
colour[1] = parseInt((removeNumeralSymbol(hex)).substring(2, 4), 16);
colour[2] = parseInt((removeNumeralSymbol(hex)).substring(4, 6), 16);
return colour;
}
function removeNumeralSymbol(hex) {
return (hex.charAt(0) == '#') ? hex.substring(1, 7) : hex
}
function rgbToHex(rgb) {
return "#" + hex(rgb[0]) + hex(rgb[1]) + hex(rgb[2]);
}
function hex(c) {
var pool = "0123456789abcdef";
var integer = parseInt(c);
if (integer == 0 || isNaN(c)) {
return "00";
}
integer = Math.round(Math.min(Math.max(0, integer), 255));
return pool.charAt((integer - integer % 16) / 16) + pool.charAt(integer % 16);
}
First of all the script will use the Ui class to show a customised menu called Extra. That menu calls the main function parseData, that reads the whole selection data with getValues. That function holds the darkest/lightest green/red colours. I used some colours for my example, but I advise you to edit them as you wish. Based on those colours, the function colourPalette will use graphical linear interpolation between the two colours (lightest and darkest). That interpolation will return an array with colours from darkest to lightest, with as many in-betweens as the maximum integer in the column. Please notice how the function uses many minimal functions to run repetitive tasks (converting from hexadecimal to RGB, formatting, etc…). When the palette is ready, the main function will create an array with all the used colours (meaning that it will skip unused colours, to give sharp contrast between big and small numbers). Finally, it will apply the palette using the setBackgrounds method. Here you can see some sample results:
In that picture you can see one set of colours per column. Varying between random small and big numbers, numerical series and mixed small/big numbers. Please feel free to ask any doubt about this approach.
A very small improvement to acques-Guzel Heron
I made it skip all non numeric values, beforehand it just errored out.
I added an option in the menu to use a custom range.
Thank you very much acques-Guzel Heron
function onOpen() {
const ui = SpreadsheetApp.getUi();
ui.createMenu('Extra')
.addItem('Generate gradient', 'parseData')
.addItem('Custom Range', 'customRange')
.addToUi();
}
function parseData(customRange = null) {
const darkestGreen = '#009000';
const lighestGreen = '#B8F4B8';
const darkestRed = '#893F45';
const lighestRed = '#FEBFC4';
let range = SpreadsheetApp.getActiveRange();
if (customRange) {
range = SpreadsheetApp.getActiveSpreadsheet().getRange(customRange);
}
const data = range.getValues();
const biggestPositive = Math.max.apply(null, data.filter(a => !isNaN([a])));
const biggestNegative = Math.min.apply(null, data.filter(a => !isNaN([a])));
const greenPalette = colorPalette(darkestGreen, lighestGreen, biggestPositive);
const redPalette = colorPalette(darkestRed, lighestRed, Math.abs(biggestNegative) + 1);
const fullPalette = [];
for (const datum of data) {
if (datum > 0) {
fullPalette.push([greenPalette[datum - 1]]);
} else if (datum < 0) {
fullPalette.push([redPalette[Math.abs(datum) - 1]]);
} else if (datum == 0 || isNaN(datum)) {
fullPalette.push(['#ffffff']);
}
}
range.setBackgrounds(fullPalette);
}
function customRange() {
const ui = SpreadsheetApp.getUi();
result = ui.prompt("Please enter a range");
parseData(result.getResponseText());
}
function colorPalette(darkestColor, lightestColor, colorSteps) {
const firstColor = hexToRGB(darkestColor);
const lastColor = hexToRGB(lightestColor);
let blending = 0;
const gradientColors = [];
for (i = 0; i < colorSteps; i++) {
const color = [];
blending += (1 / colorSteps);
color[0] = firstColor[0] * blending + (1 - blending) * lastColor[0];
color[1] = firstColor[1] * blending + (1 - blending) * lastColor[1];
color[2] = firstColor[2] * blending + (1 - blending) * lastColor[2];
gradientColors.push(rgbToHex(color));
}
return gradientColors;
}
function hexToRGB(hex) {
const color = [];
color[0] = Number.parseInt((removeNumeralSymbol(hex)).slice(0, 2), 16);
color[1] = Number.parseInt((removeNumeralSymbol(hex)).slice(2, 4), 16);
color[2] = Number.parseInt((removeNumeralSymbol(hex)).slice(4, 6), 16);
return color;
}
function removeNumeralSymbol(hex) {
return (hex.charAt(0) == '#') ? hex.slice(1, 7) : hex;
}
function rgbToHex(rgb) {
return '#' + hex(rgb[0]) + hex(rgb[1]) + hex(rgb[2]);
}
function hex(c) {
const pool = '0123456789abcdef';
let integer = Number.parseInt(c, 10);
if (integer === 0 || isNaN(c)) {
return '00';
}
integer = Math.round(Math.min(Math.max(0, integer), 255));
return pool.charAt((integer - integer % 16) / 16) + pool.charAt(integer % 16);
}

Swift 2D Array Performance

I am working on creating an iOS version of an Android app I created. It involves a lot of two-dimensional array access and assignment, and it worked very quickly on Java. However, when I converted to Swift, I noticed a very significant slowdown. After some research on two dimensional Swift arrays, I thought the problem might be coming from the 2D arrays, so I decided to create and time a simple program to test 2D array performance. I compared the execution times of a 2D and 1D array, and there was a significant difference. Below is the program I used to test performance:
import Foundation
var numberOfItems = 1000
var myArray1 = [[Double]](repeating: [Double](repeating: 0.0, count: numberOfItems), count: numberOfItems)
var myArray2 = [[Double]](repeating: [Double](repeating: 0.0, count: numberOfItems), count: numberOfItems)
var myArray3 = [Double](repeating: 0.0, count: numberOfItems * numberOfItems)
var myArray4 = [Double](repeating: 0.0, count: numberOfItems * numberOfItems)
// 2D array assignment
let start1 = CFAbsoluteTimeGetCurrent()
var x = 0.0
for i in 0..<numberOfItems {
for j in 0..<numberOfItems {
myArray1[i][j] = x
x += 1
}
}
let diff1 = CFAbsoluteTimeGetCurrent() - start1
print(diff1 * 1000)
// 2D array access and assignment
let start2 = CFAbsoluteTimeGetCurrent()
for i in 0..<numberOfItems {
for j in 0..<numberOfItems {
myArray2[i][j] = myArray1[i][j]
}
}
let diff2 = CFAbsoluteTimeGetCurrent() - start2
print(diff2 * 1000)
// 1D array assignment
var y = 0.0
let start3 = CFAbsoluteTimeGetCurrent()
for i in 0..<(numberOfItems * numberOfItems) {
myArray3[i] = y
y += 1
}
let diff3 = CFAbsoluteTimeGetCurrent() - start3
print(diff3 * 1000)
// 1D array access and assignment
let start4 = CFAbsoluteTimeGetCurrent()
for i in 0..<(numberOfItems * numberOfItems) {
myArray4[i] = myArray3[i]
}
let diff4 = CFAbsoluteTimeGetCurrent() - start4
print(diff4 * 1000)
I ran it on the command line using the -Ounchecked option. I got the following output (in ms, some variation but usually pretty close):
6.0759782791137695
24.2689847946167
2.4139881134033203
1.5819072723388672
Clearly there is a considerable performance difference between the 2D and 1D array implementations, especially when both accessing and assigning.
Is there a way to create a more efficient 2D array in Swift? Performance is important for me in this instance, so is it better to use a 1D array and do some math for indexing?
If you really want to stick to a 2D array then you can use unsafe buffer pointers for faster access. However, 1D arrays are still going to be more efficient. Give this a shot.
// 2D array assignment
myArray1.withUnsafeMutableBufferPointer { outer1 -> Void in
for i in 0..<numberOfItems {
outer1[i].withUnsafeMutableBufferPointer { inner1 -> Void in
for j in 0..<numberOfItems {
inner1[j] = x
x += 1
}
}
}
}
// 2D array access and assignment
myArray1.withUnsafeMutableBufferPointer { outer1 -> Void in
myArray2.withUnsafeMutableBufferPointer { outer2 -> Void in
for i in 0..<numberOfItems {
outer1[i].withUnsafeMutableBufferPointer { inner1 -> Void in
outer2[i].withUnsafeMutableBufferPointer { inner2 -> Void in
for j in 0..<numberOfItems {
inner2[j] = inner1[j]
}
}
}
}
}
}

Swift Metal Shader Execution of the command buffer was aborted due to an error during execution when adding a number to an array value

I'm trying some very simple algorithm using metal GPU acceleration to calculate some values in an array. The shader throws an error under some conditions I will explain.
Error: Execution of the command buffer was aborted due to an error during execution. Ignored (for causing prior/excessive GPU errors) (IOAF code 4)
The shader only throws this error when adding a value to the existing value at an index of the array. Example:
This will not cause an error:
kernel void shader (device int *wPointsIntensity [[buffer(0)]],
const device uint *wPointsXCoord [[buffer(1)]],
const device uint *wPointsYCoord [[buffer(2)]],
device float *pixelSignalIntensity [[buffer(3)]],
device float *pixelDistance [[buffer(4)]],
const device uint& noOfPoints [[ buffer(5) ]],
const device uint& width [[ buffer(6) ]],
const device uint& height [[ buffer(7) ]],
uint id [[ thread_position_in_grid ]]) {
//this does not throw error
for (uint wpIndex = 0; wpIndex < noOfPoints; wpIndex++) {
for (uint heightIndex = 0; heightIndex < height; heightIndex++) {
for (uint widthIndex = 0; widthIndex < width; widthIndex++) {
uint pixelIndex = heightIndex * width + widthIndex;
pixelDistance[pixelIndex] = float(pixelIndex);
pixelSignalIntensity[pixelIndex] = float(pixelIndex);
}}}}
While if you change
pixelDistance[pixelIndex] = float(pixelIndex);
with
pixelDistance[pixelIndex] += float(pixelIndex);
It will throw an error.
Here is the swift code:
var wPointsValues = [Int32](repeating:0, count: wPoints.count)
var wPointsXLocations = [Int32](repeating:0, count: wPoints.count)
var wPointsYLocations = [Int32](repeating:0, count: wPoints.count)
for i in 0..<wPoints.count {
wPointsValues[i] = Int32(wPoints[i].signalIntensity)
wPointsXLocations[i] = Int32(wPoints[i].location.x)
wPointsYLocations[i] = Int32(wPoints[i].location.y)
}
var numberOfWPoints:Int32 = Int32(wPoints.count)
var int32Width = Int32(width)
var int32Height = Int32(height)
//output arrays
let numberOfResults = wPoints.count * Int(width) * Int(height)
var wPointsSignalIntensity = [Float32](repeating:0.0, count: numberOfResults)
var wPointsDistance = [Float32](repeating:0.0, count: numberOfResults)
//local variables
var signalDensity:[Float32] = [Float32](repeating:0.0, count: numberOfResults)
var signalDistance:[Float32] = [Float32](repeating:0.0, count: numberOfResults)
//create input buffers
let inWPointSignalValues = device.makeBuffer(bytes: wPointsValues, length: (MemoryLayout<Int32>.stride * wPoints.count), options: [])
let inWPointXCoordBuffer = device.makeBuffer(bytes: wPointsXLocations, length: (MemoryLayout<Int32>.stride * wPoints.count), options: [])
let inWPointYCoordBuffer = device.makeBuffer(bytes: wPointsYLocations, length: (MemoryLayout<Int32>.stride * wPoints.count), options: [])
//create putput buffers
let outPixelSignalIntensityBuffer = device.makeBuffer(bytes: wPointsSignalIntensity, length: (MemoryLayout<Float32>.stride * numberOfResults), options: [])
let outPixelDistanceBuffer = device.makeBuffer(bytes: wPointsDistance, length: (MemoryLayout<Float32>.stride * numberOfResults), options: [])
let commandBuffer = (mtlCommmandQueue?.makeCommandBuffer())!
let computeCommandEncoder = (commandBuffer.makeComputeCommandEncoder())!
computeCommandEncoder.setComputePipelineState(mtlComputePipelineFilter!)
//set input buffers
computeCommandEncoder.setBuffer(inWPointSignalValues, offset: 0, index: 0)
computeCommandEncoder.setBuffer(inWPointXCoordBuffer, offset: 0, index: 1)
computeCommandEncoder.setBuffer(inWPointYCoordBuffer, offset: 0, index: 2)
//set output buffers
computeCommandEncoder.setBuffer(outPixelSignalIntensityBuffer, offset: 0, index: 3)
computeCommandEncoder.setBuffer(outPixelDistanceBuffer, offset: 0, index: 4)
//set constants
computeCommandEncoder.setBytes(&numberOfWPoints, length: MemoryLayout<Int32>.stride, index: 5)
computeCommandEncoder.setBytes(&int32Width, length: MemoryLayout<Int32>.stride, index: 6)
computeCommandEncoder.setBytes(&int32Height, length: MemoryLayout<Int32>.stride, index: 7)
let threadsPerGroup = MTLSize(width:2,height:2,depth:2)
let numThreadgroups = MTLSize(width:2, height:2, depth:2)
computeCommandEncoder.dispatchThreadgroups(numThreadgroups, threadsPerThreadgroup: threadsPerGroup)
let endBufferAllocation = mach_absolute_time()
print("time for creating and setting buffert: time: \(Double(endBufferAllocation - start) / Double(NSEC_PER_SEC))")
computeCommandEncoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let allComplete = mach_absolute_time()
self.signalDistance = (outPixelDistanceBuffer?.contents())!
self.signalDensity = (outPixelSignalIntensityBuffer?.contents())!
I had this issue for ages and program crashed intermittently. It turned out that I was accessing memory in the kernel that had not been allocated by the buffer. In the kernel I was doing a for loop 0..<5 (i.e. output 5 values for each thread) but had not divided the num_threads by 5.
When it didn't crash it was giving the correct answer and no errors except " Execution of the command buffer was aborted due to an error during execution. Caused GPU Hang Error (IOAF code 3)" were ever thrown.

Swift- 'For' Loop not adding variable to total result

I am trying to create a function in Playground using Swift where a calculation is made several times, and then added to the total sum of calculations until the loop is over. Everything seems to be working, except that when I try to sum the every calculation to the last total, it just gives me the value of the calculation. Here is my code:
func Calc(diff: String, hsh: String, sperunit: Float, rate: Float, n: Int16, p: Float, length: Int16) -> Float {
//Divisions per Year
let a: Int16 = length/n
let rem = length - (a*n)
let spl = Calc(diff, hsh: hash, sperunit: sperunit, rate: rate)
for var i = 0; i < Int(a) ; i++ { //also tried for i in i..<a
var result: Float = 0
let h = (spl * Float(n) / pow (p,Float(i))) //This gives me a correct result
result += h //This gives me the same result from h
finalResult = result
}
finalResult = finalResult + (Float(rem) * spl / pow (p,Float(a))) //This line is meant to get the result variable out of the loop and do an extra calculation outside of the loop
print(finalResult)
return finalResult
}
Am I doing something wrong?
Currently your variable result is scoped to the loop and does not exist outside of it. Additionally every run of the loop creates a new result variable, initialized with 0.
What you have to do is move the line var result: Float = 0 in front of the for loop:
var result: Float = 0
for var i = 0; i < Int(a) ; i++ {
let h = (spl * Float(n) / pow (p,Float(i)))
result += h
finalResult = result
}
Additionally you can remove the repeated assignment of finalResult = result and just do it once after the loop is over.
You can probably remove the finalResult completely. Just write
var result: Float = 0
for var i = 0; i < Int(a) ; i++ {
let h = (spl * Float(n) / pow (p,Float(i)))
result += h
}
result += (Float(rem) * spl / pow (p,Float(a)))
print(result)
return result

Apparent indices limit

Using SceneKit in swift I trying to build a custom 3D object (a terrain). To build a terrain I build a plane that I've divided in a number of horizontal and vertical section. With a small number or section everything is fine but with not so large number the app crash in some deep OpenGL function with a EXC_BAD_ACCESS.
Here is a simplified version of the terrain (yes it's just a plane) which don't exhibit the issue:
let width:Float = 12
let depth:Float = 12
let height:Float = 2
let nx = 6
let nz = 6
func build() -> SCNGeometry {
var vertices : [SCNVector3] = Array()
for i in 0..<(nx + 1) {
for j in 0..<(nz + 1) {
let x = (Float(i) / Float(nx)) * width - width/2
let z = (Float(j) / Float(nz)) * depth - depth/2
let y = Float(0)
vertices.append(SCNVector3(x:x, y:y, z:z))
}
}
var indices : [CInt] = []
for i in 0..<nx {
for j in 0..<nz {
indices.append(CInt(i + j * (nz+1)))
indices.append(CInt(i+1 + j * (nz+1)))
indices.append(CInt(i + (j+1)*(nz+1)))
indices.append(CInt(i+1 + j * (nz+1)))
indices.append(CInt(i+1 + (j+1)*(nz+1)))
indices.append(CInt(i + (j+1)*(nz+1)))
}
}
let data = NSData(bytes: vertices, length: sizeof(SCNVector3) * countElements(vertices))
let vertexSource = SCNGeometrySource(data: data, semantic: SCNGeometrySourceSemanticVertex, vectorCount: vertices.count, floatComponents: true, componentsPerVector: 3, bytesPerComponent: sizeof(Float), dataOffset: 0, dataStride: sizeof(SCNVector3))
let indexData = NSData(bytes: indices, length: sizeof(CInt) * countElements(indices))
let element = SCNGeometryElement(data: indexData, primitiveType: SCNGeometryPrimitiveType.Triangles, primitiveCount: indices.count, bytesPerIndex: sizeof(CInt))
return SCNGeometry(sources: [vertexSource], elements: [element])
}
Now change nx and nz to:
let nx = 8
let nz = 8
Crash
This seems very much linked with the number of indices but at ~300 I don't believe I should be hitting a limit.
Any suggestion, help or solution very much appreciated. Thanks.
The problem could be that you're passing primitiveCount: indices.count when creating the SCNGeometryElement rather than indices.count/3 (since there are three indices per triangle). I'm surprised there's no earlier bounds checking, but without that, you could certainly see a crash depending on the number of indices.

Resources