I'd like to return a dask dataframe from an overlapping dask array computation, where each block's computation returns a pandas dataframe. The example below shows one way to do this, simplified for demonstration purposes. I've found a combination of da.overlap.overlap and to_delayed().ravel() as able to get the job done, if I pass in the relevant block key and chunk information.
Edit:
Thanks to a #AnnaM who caught bugs in the original post and then made it general! Building off of her comments, I'm including an updated version of the code. Also, in responding to Anna's interest in memory usage, I verified that this does not seem to take up more memory than naively expected.
def extract_features_generalized(chunk, offsets, depth, columns):
shape = np.asarray(chunk.shape)
offsets = np.asarray(offsets)
depth = np.asarray(depth)
coordinates = np.stack(np.nonzero(chunk)).T
keep = ((coordinates >= depth) & (coordinates < (shape - depth))).all(axis=1)
data = coordinates + offsets - depth
df = pd.DataFrame(data=data, columns=columns)
return df[keep]
def my_overlap_generalized(data, chunksize, depth, columns, boundary):
data = data.rechunk(chunksize)
data_overlapping_chunks = da.overlap.overlap(data, depth=depth, boundary=boundary)
dfs = []
for block in data_overlapping_chunks.to_delayed().ravel():
offsets = np.array(block.key[1:]) * np.array(data.chunksize)
df_block = dask.delayed(extract_features_generalized)(block, offsets=offsets,
depth=depth, columns=columns)
dfs.append(df_block)
return dd.from_delayed(dfs)
data = np.zeros((2,4,8,16,16))
data[0,0,4,2,2] = 1
data[0,1,4,6,2] = 1
data[1,2,4,8,2] = 1
data[0,3,4,2,2] = 1
arr = da.from_array(data)
df = my_overlap_generalized(arr,
chunksize=(-1,-1,-1,8,8),
depth=(0,0,0,2,2),
columns=['r', 'c', 'z', 'y', 'x'],
boundary=tuple(['reflect']*5))
df.compute().reset_index()
-- Remainder of original post, including original bugs --
My example only does xy overlaps, but it's easy to generalize. Is there anything below that is suboptimal or could be done better? Is anything likely to break because it's relying on low-level information that could change (e.g. block key)?
def my_overlap(data, chunk_xy, depth_xy):
data = data.rechunk((-1,-1,-1, chunk_xy, chunk_xy))
data_overlapping_chunks = da.overlap.overlap(data,
depth=(0,0,0,depth_xy,depth_xy),
boundary={3: 'reflect', 4: 'reflect'})
dfs = []
for block in data_overlapping_chunks.to_delayed().ravel():
offsets = np.array(block.key[1:]) * np.array(data.chunksize)
df_block = dask.delayed(extract_features)(block, offsets=offsets, depth_xy=depth_xy)
dfs.append(df_block)
# All computation is delayed, so downstream comptutions need to know the format of the data. If the meta
# information is not specified, a single computation will be done (which could be expensive) at this point
# to infer the metadata.
# This empty dataframe has the index, column, and type information we expect in the computation.
columns = ['r', 'c', 'z', 'y', 'x']
# The dtypes are float64, except for a small number of columns
df_meta = pd.DataFrame(columns=columns, dtype=np.float64)
df_meta = df_meta.astype({'c': np.int64, 'r': np.int64})
df_meta.index.name = 'feature'
return dd.from_delayed(dfs, meta=df_meta)
def extract_features(chunk, offsets, depth_xy):
r, c, z, y, x = np.nonzero(chunk)
df = pd.DataFrame({'r': r, 'c': c, 'z': z, 'y': y+offsets[3]-depth_xy,
'x': x+offsets[4]-depth_xy})
df = df[(df.y > depth_xy) & (df.y < (chunk.shape[3] - depth_xy)) &
(df.z > depth_xy) & (df.z < (chunk.shape[4] - depth_xy))]
return df
data = np.zeros((2,4,8,16,16)) # round, channel, z, y, x
data[0,0,4,2,2] = 1
data[0,1,4,6,2] = 1
data[1,2,4,8,2] = 1
data[0,3,4,2,2] = 1
arr = da.from_array(data)
df = my_overlap(arr, chunk_xy=8, depth_xy=2)
df.compute().reset_index()
First of all, thanks for posting your code. I am working on a similar problem and this was really helpful for me.
When testing your code, I discovered a few mistakes in the extract_features function that prevent your code from returning correct indices.
Here is a corrected version:
def extract_features(chunk, offsets, depth_xy):
r, c, z, y, x = np.nonzero(chunk)
df = pd.DataFrame({'r': r, 'c': c, 'z': z, 'y': y, 'x': x})
df = df[(df.y >= depth_xy) & (df.y < (chunk.shape[3] - depth_xy)) &
(df.x >= depth_xy) & (df.x < (chunk.shape[4] - depth_xy))]
df['y'] = df['y'] + offsets[3] - depth_xy
df['x'] = df['x'] + offsets[4] - depth_xy
return df
The updated code now returns the indices that were set to 1:
index r c z y x
0 0 0 0 4 2 2
1 1 0 1 4 6 2
2 2 0 3 4 2 2
3 1 1 2 4 8 2
For comparison, this is the output of the original version:
index r c z y x
0 1 0 1 4 6 2
1 3 1 2 4 8 2
2 0 0 1 4 6 2
3 1 1 2 4 8 2
It returns lines number 2 and 4, two times each.
The reason why this happens is three mistakes in the extract_features function:
You first add the offset and subtract the depth and then filter out the overlapping parts: the order needs to be swapped
df.y > depth_xy should be replaced with df.y >= depth_xy
df.z should be replaced with df.x, since it is the x dimension that has an overlap
To optimize this even further, here is a generalized version of the code that would work for an arbitrary number of dimension:
def extract_features_generalized(chunk, offsets, depth, columns):
coordinates = np.nonzero(chunk)
df = pd.DataFrame()
rows_to_keep = np.ones(len(coordinates[0]), dtype=int)
for i in range(len(columns)):
df[columns[i]] = coordinates[i]
rows_to_keep = rows_to_keep * np.array((df[columns[i]] >= depth[i])) * \
np.array((df[columns[i]] < (chunk.shape[i] - depth[i])))
df[columns[i]] = df[columns[i]] + offsets[i] - depth[i]
del coordinates
return df[rows_to_keep > 0]
def my_overlap_generalized(data, chunksize, depth, columns):
data = data.rechunk(chunksize)
data_overlapping_chunks = da.overlap.overlap(data, depth=depth,
boundary=tuple(['reflect']*len(columns)))
dfs = []
for block in data_overlapping_chunks.to_delayed().ravel():
offsets = np.array(block.key[1:]) * np.array(data.chunksize)
df_block = dask.delayed(extract_features_generalized)(block, offsets=offsets,
depth=depth, columns=columns)
dfs.append(df_block)
return dd.from_delayed(dfs)
data = np.zeros((2,4,8,16,16))
data[0,0,4,2,2] = 1
data[0,1,4,6,2] = 1
data[1,2,4,8,2] = 1
data[0,3,4,2,2] = 1
arr = da.from_array(data)
df = my_overlap_generalized(arr, chunksize=(-1,-1,-1,8,8),
depth=(0,0,0,2,2), columns=['r', 'c', 'z', 'y', 'x'])
df.compute().reset_index()
I want to generate a 2 dimensional periodic pattern from a pseudorandom binary sequence like this one with the following context:
A periodic pattern satisfies the equations (1) and (2)
W(x + q0N0, y) = W(x, y); q0, N0 >1 (1)
W(x, y + q1N1) = W(x, y); q1, N1 > 1, (2)
where N0 and N1 determine the periodicity of repetitions and q0 and q1 a repetition number on the horizontal and vertical directions. Generation from pseudorandom values {−1, 1} produces a rectangular, binary valued pattern.
One way to achieve that would be to take a small pseudo-random 2D pattern (the sequence) and repeat it periodically so that neighboring tiles are always mirrored giving a sense of smooth continuity.
After you specified your requirements with W(x + q0N0, y) = W(x, y) and W(x, y + q1N1) = W(x, y); it becomes clear, that this is exactly (without the mirroring part) what you want.
You simply have to repeat a random pattern a certain number of times in both directions.
Example (similar to your image where the period length in the vertical direction is longer than in the horizontal direction)
Code (in Matlab)
% base pattern
N0 = 20;
N1 = 5;
base = rand([N0, N1]) > 0.5; % pseudo-random
% periodically repeating the pattern
Q0 = 5;
Q1 = 20;
pattern = zeros([N0*Q0,N1*Q1]);
for q0 = 1 : Q0
for q1 = 1 : Q1
pattern((q0-1)*N0+1:q0*N0, (q1-1)*N1+1:q1*N1) = base;
end
end
% save
imwrite(pattern, 'test.jpg');
% display
imagesc(pattern);
axis image;
colormap(gray);
The first lines just compute a random, binary pattern of a certain size N0 x N1
% base pattern
N0 = 20;
N1 = 5;
base = rand([N0, N1]) > 0.5; % pseudo-random
Then comes the definition of the number of repetitions of the pattern in each direction
Q0 = 5;
Q1 = 20;
Finally, in two nested but rather simple for loops, the base pattern is repeated
pattern = zeros([N0*Q0,N1*Q1]);
for q0 = 1 : Q0
for q1 = 1 : Q1
pattern((q0-1)*N0+1:q0*N0, (q1-1)*N1+1:q1*N1) = base;
end
end
The computation of the indices (where to place the base patterns) fulfills your requirement equations
(q0-1)*N0+1:q0*N0, (q1-1)*N1+1:q1*N1
Old Example (with mirroring)
Code (in Matlab)
% base pattern
N = 20;
base = rand(N) > 0.5; % pseudo-random
% multiplying the pattern
M = 4;
pattern = zeros(N*M);
for i = 1 : M
for j = 1 : M
b = base;
% mirroring the base
if mod(i, 2) == 1
flip(b, 2);
end
if mod(j, 2) == 1
flip(b, 1);
end
pattern((i-1)*N+1:i*N, (j-1)*N+1:j*N) = b;
end
end
% save
imwrite(pattern, 'test.jpg');
% display
imagesc(pattern);
axis image;
colormap(gray);
The patterns are flipped (mirrored) in one or two direction sometimes to simulates some kind of smoothness (symmetry) of the pattern.
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 5 years ago.
Improve this question
Sorry I have tried a lot to solve this recurrence equation
T (n) = 3T (n / 3) + Ѳ (log3n)
with the replacement method but I can not get the required result:
1) T (n) = O (nlogn)
2) Induction
Base: for every n = 1 -> 1log1 + 1 = 1 = T (1)
Inductive step: T (k) = klogk + k for each k <n
Use k = n / 3
T (n) = 3T (n / 3) + Ѳ (log₃n)
1) T (n) = O (nlogn)
2) Induction
Base: for every n = 1 -> 1log1 + 1 = 1 = T (1)
Inductive step: T (k) = klogk + k for each k <n
Use k = n / 3
T (n) = 3T (n / 3) + Ѳ (log₃n)
= 3 [n / 3logn / 3 + n / 3] + (log₃n)
= nlogn / 3 + n + (log₃n)
= n(logn-log3) + n + (log₃n)
= nlogn-nlog3 + n + (log3n)
Firstly we can (eventually) ignore the base 3 in the Theta-notation, as it amounts to a multiplicative factor as is therefore irrelevant. Then we can try the following method:
1. Hypothesis by inspection:
If we re-substitute T into itself multiple times, we get:
What is the upper limit m? We need to assume that T(n) has a stopping condition, i.e. some value of n where it stops recursing. Assuming that it is n = 1 (it really doesn't matter, as long as it's a constant much smaller than n). Continuing (and briefly restoring the base 3):
Surprisingly the answer is not Ө(n log n).
2. Induction base case
We don't use induction to prove the final result, but the series result we deduced by inspecting the behaviour of the expansion.
For the base case n / 3 = 1, we have:
Which is consistent.
3. Induction recurrence
Again, consistent. Thus by induction the summation result is correct, and T(n) is indeed Ө(n).
4. Numerical tests:
Just in case you still cannot believe that it is Ө(n), here is a numerical test to prove the result.
Javascript code:
function T(n) {
return n <= 1 ? 0 : 3*T(floor(n/3)) + log(n);
}
Results:
n T(n)
--------------------------
10 5.598421959
100 66.33828212
1000 702.3597066
10000 6450.185742
100000 63745.45154
1000000 580674.1886
10000000 8924162.276
100000000 81068207.64
Graph:
The linear relationship is clear.
I am working on a client's site, and I'm writing an amortization schedule calculator in in ruby on rails. For longer loan term calculations, it doesn't seem to be breaking when the balance reaches 0
Here is my code:
def calculate_amortization_results
p = params[:price].to_i
i = params[:rate].to_d
l = params[:term].to_i
j = i/(12*100)
n = l * 12
m = p * (j / (1 - (1 + j) ** (-1 * n)))
#loanAmount = p
#rateAmount = i
#monthlyAmount = m
#amort = []
#interestAmount = 0
while p > 0
line = Hash.new
h = p*j
c = m-h
p = p-c
line["interest"] = h
line["principal"] = c
if p <= 0
line["balance"] = 0
else
line["balance"] = p
end
line["payment"] = h+c
#amort.push(line)
#interestAmount += h
end
end
And here is the view:
- #amort.each_with_index do |a, i|
%li
.m
= i+1
.i
= number_to_currency(a["interest"], :unit => "$")
.p
= number_to_currency(a["principal"], :unit => "$")
.pp
= number_to_currency(a["payment"], :unit => "$")
.b
= number_to_currency(a["balance"], :unit => "$")
What I am seeing is, in place of $0.00 in the final payment balance, it shows "-$-inf", iterates one more loop, then displays $0.00, but shows "-$-inf" for interest. It should loop until p gets to 0, then stop and set the balance as 0, but it isn't. Any idea what I've done wrong?
The calculator is here. It seems to work fine for shorter terms, like 5 years, but longer terms cause the above error.
Edit:
Changing the while loop to n.times do
and then changing the balance view to
= number_to_currency(a["balance"], :unit => "$", :negative_format => "$0.00")
Is a workaround, but i'd like to know why the while loop wouldn't work correctly
in Ruby the default for numerical values is Fixnum ... e.g.:
> 15 / 4
=> 3
You will see weird rounding errors if you try to use Fixnum values and divide them.
To make sure that you use Floats, at least one of the numbers in the calculation needs to be a Float
> 15.0 / 4
=> 3.75
> 15 / 4.0
=> 3.75
You do two comparisons against 0 , which should be OK if you make sure that p is a Float.
As the other answer suggests, you should use "decimal" type in your database to represent currency.
Please try if this will work:
def calculate_amortization_results
p = params[:price].to_f # instead of to_i
i = params[:rate].to_f # <-- what is to_d ? use to_f
l = params[:term].to_i
j = i/(12*100.0) # instead of 100
n = l * 12
m = p * (j / (1 - (1 + j) ** (-1 * n))) # division by zero if i==0 ==> j==0
#loanAmount = p
#rateAmount = i
#monthlyAmount = m
#amort = []
#interestAmount = 0.0 # instead of 0
while p > 0
line = Hash.new
h = p*j
c = m-h
p = p-c
line["interest"] = h
line["principal"] = c
if p <= 0
line["balance"] = 0
else
line["balance"] = p
end
line["payment"] = h+c
#amort.push(line)
#interestAmount += h
end
end
If you see "inf" in your output, you are doing a division by zero somewhere.. better check the logic of your calculation, and guard against division by zero.
according to Wikipedia the formula is:
http://en.wikipedia.org/wiki/Amortization_calculator
to improve rounding errors, it's probably better to re-structure the formula like this:
m = (p * j) / (1 - (1 + j) ** (-1 * n) # these are two divisions! x**-1 == 1/x
which is equal to:
m = (p * j) + (p * j) / ((1 + j) ** n) - 1.0)
which is equal to: (use this one)
q = p * j # this is much larger than 1 , so fewer rounding errors when dividing it by something
m = q + q / ((1 + j) ** n) - 1.0) # only one division
I think it has something to do with the floating point operations precision. It has already been discussed here: Ruby number precision with simple arithmetic and it would be better to use decimal format for financial purposes.
The answer could be computing the numbers in the loop, but with precomputed number of iterations and from the scratch.