I'd like to return a dask dataframe from an overlapping dask array computation, where each block's computation returns a pandas dataframe. The example below shows one way to do this, simplified for demonstration purposes. I've found a combination of da.overlap.overlap and to_delayed().ravel() as able to get the job done, if I pass in the relevant block key and chunk information.
Edit:
Thanks to a #AnnaM who caught bugs in the original post and then made it general! Building off of her comments, I'm including an updated version of the code. Also, in responding to Anna's interest in memory usage, I verified that this does not seem to take up more memory than naively expected.
def extract_features_generalized(chunk, offsets, depth, columns):
shape = np.asarray(chunk.shape)
offsets = np.asarray(offsets)
depth = np.asarray(depth)
coordinates = np.stack(np.nonzero(chunk)).T
keep = ((coordinates >= depth) & (coordinates < (shape - depth))).all(axis=1)
data = coordinates + offsets - depth
df = pd.DataFrame(data=data, columns=columns)
return df[keep]
def my_overlap_generalized(data, chunksize, depth, columns, boundary):
data = data.rechunk(chunksize)
data_overlapping_chunks = da.overlap.overlap(data, depth=depth, boundary=boundary)
dfs = []
for block in data_overlapping_chunks.to_delayed().ravel():
offsets = np.array(block.key[1:]) * np.array(data.chunksize)
df_block = dask.delayed(extract_features_generalized)(block, offsets=offsets,
depth=depth, columns=columns)
dfs.append(df_block)
return dd.from_delayed(dfs)
data = np.zeros((2,4,8,16,16))
data[0,0,4,2,2] = 1
data[0,1,4,6,2] = 1
data[1,2,4,8,2] = 1
data[0,3,4,2,2] = 1
arr = da.from_array(data)
df = my_overlap_generalized(arr,
chunksize=(-1,-1,-1,8,8),
depth=(0,0,0,2,2),
columns=['r', 'c', 'z', 'y', 'x'],
boundary=tuple(['reflect']*5))
df.compute().reset_index()
-- Remainder of original post, including original bugs --
My example only does xy overlaps, but it's easy to generalize. Is there anything below that is suboptimal or could be done better? Is anything likely to break because it's relying on low-level information that could change (e.g. block key)?
def my_overlap(data, chunk_xy, depth_xy):
data = data.rechunk((-1,-1,-1, chunk_xy, chunk_xy))
data_overlapping_chunks = da.overlap.overlap(data,
depth=(0,0,0,depth_xy,depth_xy),
boundary={3: 'reflect', 4: 'reflect'})
dfs = []
for block in data_overlapping_chunks.to_delayed().ravel():
offsets = np.array(block.key[1:]) * np.array(data.chunksize)
df_block = dask.delayed(extract_features)(block, offsets=offsets, depth_xy=depth_xy)
dfs.append(df_block)
# All computation is delayed, so downstream comptutions need to know the format of the data. If the meta
# information is not specified, a single computation will be done (which could be expensive) at this point
# to infer the metadata.
# This empty dataframe has the index, column, and type information we expect in the computation.
columns = ['r', 'c', 'z', 'y', 'x']
# The dtypes are float64, except for a small number of columns
df_meta = pd.DataFrame(columns=columns, dtype=np.float64)
df_meta = df_meta.astype({'c': np.int64, 'r': np.int64})
df_meta.index.name = 'feature'
return dd.from_delayed(dfs, meta=df_meta)
def extract_features(chunk, offsets, depth_xy):
r, c, z, y, x = np.nonzero(chunk)
df = pd.DataFrame({'r': r, 'c': c, 'z': z, 'y': y+offsets[3]-depth_xy,
'x': x+offsets[4]-depth_xy})
df = df[(df.y > depth_xy) & (df.y < (chunk.shape[3] - depth_xy)) &
(df.z > depth_xy) & (df.z < (chunk.shape[4] - depth_xy))]
return df
data = np.zeros((2,4,8,16,16)) # round, channel, z, y, x
data[0,0,4,2,2] = 1
data[0,1,4,6,2] = 1
data[1,2,4,8,2] = 1
data[0,3,4,2,2] = 1
arr = da.from_array(data)
df = my_overlap(arr, chunk_xy=8, depth_xy=2)
df.compute().reset_index()
First of all, thanks for posting your code. I am working on a similar problem and this was really helpful for me.
When testing your code, I discovered a few mistakes in the extract_features function that prevent your code from returning correct indices.
Here is a corrected version:
def extract_features(chunk, offsets, depth_xy):
r, c, z, y, x = np.nonzero(chunk)
df = pd.DataFrame({'r': r, 'c': c, 'z': z, 'y': y, 'x': x})
df = df[(df.y >= depth_xy) & (df.y < (chunk.shape[3] - depth_xy)) &
(df.x >= depth_xy) & (df.x < (chunk.shape[4] - depth_xy))]
df['y'] = df['y'] + offsets[3] - depth_xy
df['x'] = df['x'] + offsets[4] - depth_xy
return df
The updated code now returns the indices that were set to 1:
index r c z y x
0 0 0 0 4 2 2
1 1 0 1 4 6 2
2 2 0 3 4 2 2
3 1 1 2 4 8 2
For comparison, this is the output of the original version:
index r c z y x
0 1 0 1 4 6 2
1 3 1 2 4 8 2
2 0 0 1 4 6 2
3 1 1 2 4 8 2
It returns lines number 2 and 4, two times each.
The reason why this happens is three mistakes in the extract_features function:
You first add the offset and subtract the depth and then filter out the overlapping parts: the order needs to be swapped
df.y > depth_xy should be replaced with df.y >= depth_xy
df.z should be replaced with df.x, since it is the x dimension that has an overlap
To optimize this even further, here is a generalized version of the code that would work for an arbitrary number of dimension:
def extract_features_generalized(chunk, offsets, depth, columns):
coordinates = np.nonzero(chunk)
df = pd.DataFrame()
rows_to_keep = np.ones(len(coordinates[0]), dtype=int)
for i in range(len(columns)):
df[columns[i]] = coordinates[i]
rows_to_keep = rows_to_keep * np.array((df[columns[i]] >= depth[i])) * \
np.array((df[columns[i]] < (chunk.shape[i] - depth[i])))
df[columns[i]] = df[columns[i]] + offsets[i] - depth[i]
del coordinates
return df[rows_to_keep > 0]
def my_overlap_generalized(data, chunksize, depth, columns):
data = data.rechunk(chunksize)
data_overlapping_chunks = da.overlap.overlap(data, depth=depth,
boundary=tuple(['reflect']*len(columns)))
dfs = []
for block in data_overlapping_chunks.to_delayed().ravel():
offsets = np.array(block.key[1:]) * np.array(data.chunksize)
df_block = dask.delayed(extract_features_generalized)(block, offsets=offsets,
depth=depth, columns=columns)
dfs.append(df_block)
return dd.from_delayed(dfs)
data = np.zeros((2,4,8,16,16))
data[0,0,4,2,2] = 1
data[0,1,4,6,2] = 1
data[1,2,4,8,2] = 1
data[0,3,4,2,2] = 1
arr = da.from_array(data)
df = my_overlap_generalized(arr, chunksize=(-1,-1,-1,8,8),
depth=(0,0,0,2,2), columns=['r', 'c', 'z', 'y', 'x'])
df.compute().reset_index()
I am trying to return a pair of sums using the let construct in sml. Every way I have tried will only return one value. I have tried creating a list by using cons (::) and then returning the list, but that gives an error as well.
val t = [(3,4), (4,5), (5,6)];
fun sumPairs(nil) = 0
| sumPairs((x,y)::zs) =
let
val sumFirst = x + sumPairs(zs)
val sumSecond = y + sumPairs(zs)
in
(sumFirst, sumSecond) <how would I return this as a tuple or list?>
end;
sumPairs(t);
The problem is not with (sumFirst, sumSecond) or with let specifically, but with the rest of your code.
The base case and the recursions say that sumPairs produces an int, not a pair of ints.
Because of this, there is a conflict when you try produce a pair.
Your base case should be (0,0), not 0, since it must be a pair.
You also need to deconstruct the result from the recursion since that produces a pair, not an integer.
Like this
fun sumPairs nil = (0, 0)
| sumPairs ((x,y)::zs) =
let
val (sumFirst, sumSecond) = sumPairs zs
in
(x + sumFirst, y + sumSecond)
end;
I have 2 arrays say arrayA & arrayB. arrayA has the elements say [1,2] and arrayB has the elements say [3,4]. Now I want to multiply and add the elements in these arrays like so.. 1x3 + 2x4 = 11. How can I achieve this...?
Here a combo of zip, map and reduce:
let result = (zip([1,2], [3,4]).map { $0.0 * $0.1 }).reduce(0, +)
print(result) // 11
Zip makes a sequence of pairs based on the two arrays: (1,3), (2,4)
with map I am iterating for each element of the array producing at each iteration a new element
$0 means the element of the sequence at the current iteration. Since the element is a pair (because of zip), we can access to the first sub-element of the pair with $0.0 and to the second with $0.1.
finally (after map) we get an array of products, just need to "reduce" it to a number, summing all the resulting elements with reduce.
(0, +) means that reduce starts from 0 as initial value, then with the abbreviation + we can accumulate the sums of all the elements.
Note that rather than using a chained map and reduce (for multiplication and summation, respectively), you could directly apply the reduce operation on the zipped sequence, and modify the reduce closure to accordingly calculate the sum of the pair-wise multiplied objects in the zipped sequence:
let a = [1, 2]
let b = [3, 4]
let result = zip(a,b).reduce(0) { $0 + $1.0 * $1.1 } // 11
Try this.
let A = [1,2]
let B = [3,4]
let C = zip(A, B).map {$0.0 * $0.1}
print(C) // [3, 8]
let sum = C.reduce(0, +)
print(sum)//11
If I have two unknown values, lets say x and y, what is the best way loop through all of the values between between those values?
For example, given the values x = 0 and y = 5 I would like to do something with the values 0, 1, 2, 3, 4, and 5. The result could exclude 0 and 5 if this is simpler.
Using Swift's Range operator, I could do something like this:
for i in x...y {
// Do something with i
}
Except I do not know if x or y is the greater value.
The Swift documentation for Range Operators states:
The closed range operator (a...b) defines a range that runs from a to b, and includes the values a and b. The value of a must not be greater than b.
There are a number of solutions here. A pretty straight forward one is:
let diff = y - x
for i in 0...abs(diff) {
let value = min(x, y) + i
// Do something with value
}
Is there a better, or more elegant way to achieve this?
I guess the most explicit way of writing it would be:
for i in min(a, b)...max(a, b) {
// Do something with i
}
To exclude the first and last value, you can increment your lower limit and use the Swift ..< syntax:
let lowerLimit = min(a, b) + 1
let upperLimit = max(a, b)
for i in lowerLimit..<upperLimit {
// Do something with i
}
Consider the following code:
let dl = 9.5 / 11.
let min = 21.5 + dl
let max = 40.5 - dl
let a = [ for z in min .. dl .. max -> z ] // should have 21 elements
let b = a.Length
"a" should have 21 elements but has got only 20 elements. The "max - dl" value is missing. I understand that float numbers are not precise, but I hoped that F# could work with that. If not then why F# supports List comprehensions with float iterator? To me, it is a source of bugs.
Online trial: http://tryfs.net/snippets/snippet-3H
Converting to decimals and looking at the numbers, it seems the 21st item would 'overshoot' max:
let dl = 9.5m / 11.m
let min = 21.5m + dl
let max = 40.5m - dl
let a = [ for z in min .. dl .. max -> z ] // should have 21 elements
let b = a.Length
let lastelement = List.nth a 19
let onemore = lastelement + dl
let overshoot = onemore - max
That is probably due to lack of precision in let dl = 9.5m / 11.m?
To get rid of this compounding error, you'll have to use another number system, i.e. Rational. F# Powerpack comes with a BigRational class that can be used like so:
let dl = 95N / 110N
let min = 215N / 10N + dl
let max = 405N / 10N - dl
let a = [ for z in min .. dl .. max -> z ] // Has 21 elements
let b = a.Length
Properly handling float precision issues can be tricky. You should not rely on float equality (that's what list comprehension implicitely does for the last element). List comprehensions on float are useful when you generate an infinite stream. In other cases, you should pay attention to the last comparison.
If you want a fixed number of elements, and include both lower and upper endpoints, I suggest you write this kind of function:
let range from to_ count =
assert (count > 1)
let count = count - 1
[ for i = 0 to count do yield from + float i * (to_ - from) / float count]
range 21.5 40.5 21
When I know the last element should be included, I sometimes do:
let a = [ for z in min .. dl .. max + dl*0.5 -> z ]
I suspect the problem is with the precision of floating point values. F# adds dl to the current value each time and checks if current <= max. Because of precision problems, it might jump over max and then check if max+ε <= max (which will yield false). And so the result will have only 20 items, and not 21.
After running your code, if you do:
> compare a.[19] max;;
val it : int = -1
It means max is greater than a.[19]
If we do calculations the same way the range operator does but grouping in two different ways and then compare them:
> compare (21.5+dl+dl+dl+dl+dl+dl+dl+dl) ((21.5+dl)+(dl+dl+dl+dl+dl+dl+dl));;
val it : int = 0
> compare (21.5+dl+dl+dl+dl+dl+dl+dl+dl+dl) ((21.5+dl)+(dl+dl+dl+dl+dl+dl+dl+dl));;
val it : int = -1
In this sample you can see how adding 7 times the same value in different order results in exactly the same value but if we try it 8 times the result changes depending on the grouping.
You're doing it 20 times.
So if you use the range operator with floats you should be aware of the precision problem.
But the same applies to any other calculation with floats.