The following code showed that generating a sequence using a sequence expression containing for was approximately five times faster than generating the same sequence using Seq.init.
open System
let rand count =
let rnd = Random() // if this value is not created all numbers are equal
seq {for i in 0..(count - 1) -> rnd.NextDouble()}
/// Perhaps more "functional" than rand but slower
let rand2 count =
let rnd = Random()
let rnd2 (i: int) = rnd.NextDouble()
Seq.init count rnd2
> rand 1000000 |> List.ofSeq |> List.head;;
Real: 00:00:00.092, CPU: 00:00:00.093, GC gen0: 3, gen1: 2, gen2: 0
val it : float = 0.1358240168
> rand2 1000000 |> List.ofSeq |> List.head;;
Real: 00:00:00.473, CPU: 00:00:00.484, GC gen0: 21, gen1: 20, gen2: 1
val it : float = 0.4128856414
Questions:
1) What is the reason for the speed difference?
2) Is the Seq.init alternative in some sense "more functional" than the sequence expression alternative?
3) Are the two alternatives equivalent in terms of thread-safety?
What is the reason for the speed difference?
Seq.init is slow because it uses Seq.upto which is slow. Seq.upto is slow mainly because it creates a Lazy instance for every object in the pipeline. This also explains the GC pressure.
In the current state of Fsharp.Core, if you need performance Seq isn't the right option.
This will change though when the manofstick PR is merged.
In addition, even
seq {for i in 0..(count - 1) -> rnd.NextDouble()}
is slow compared to pipelines such as Nessos or manofstick's improved Seq.
Is the Seq.init alternative in some sense "more functional" than the sequence expression alternative?
Sequence expressions, aka sequence comprehensions, are related to set comprehensions in mathematics. IMO both have a functional "taste" to them.
Are the two alternatives equivalent in terms of thread-safety?
Yes, in that neither provide thread-safety.
PS. Another reason Seq and LINQ are slow is that they rely on pull pipelines. Push pipelines are faster. Nessos and manofstick pipelines support both AFAICT and choose push if possible.
P.S. I wrote a quick little performance comparison of different pipelines. The result is a sum, not a list, to isolate the actual pipeline performance from the cost of creating lists. I also vary the number of inner and outer iterations in order to detect overhead from creating pipelines:
// A simplistic push-pipeline
type Receiver<'T> = 'T -> bool
type Stream<'T> = Receiver<'T> -> unit
module Stream =
let inline init count gen : Stream<_> =
fun r ->
let rec loop i =
if i < count && r (gen i) then
loop (i + 1)
loop 0
let inline sum (s : Stream<_>) : _ =
let mutable a = LanguagePrimitives.GenericZero<_>
s (fun v -> a <- a + v; true)
a
let rnd = System.Random ()
let gen = fun _ -> rnd.NextDouble ()
let clock =
let sw = System.Diagnostics.Stopwatch ()
sw.Start ()
fun () -> sw.ElapsedMilliseconds
open System
let timeIt n a =
let r = a () // Warm-up
GC.Collect (2, GCCollectionMode.Forced, true)
let inline cc g = GC.CollectionCount g
let bcc0, bcc1, bcc2 = cc 0, cc 1, cc 2
let before = clock ()
for i = 1 to n do
a () |> ignore
let after = clock ()
let acc0, acc1, acc2 = cc 0, cc 1, cc 2
after - before, acc0 - bcc0, acc1 - bcc1, acc2 - bcc2, r
open System.Linq
[<EntryPoint>]
let main argv =
let count = 10000000
let outers =
[|
1000000
10000
100
1
|]
for outer in outers do
let inner = count / outer
let testCases =
[|
"Push stream" , fun () -> Stream.init inner gen |> Stream.sum
"LINQ" , fun () -> (Enumerable.Range (0, inner)).Select(gen).Sum()
"Seq.init" , fun () -> Seq.init inner gen |> Seq.sum
"Seq comprehension" , fun () -> seq { for i in 0..inner - 1 -> gen i } |> Seq.sum
"Tail-recursion" , fun () ->
let rec loop a i =
if i < inner then
loop (a + gen i) (i + 1)
else
a
loop 0. 0
|]
printfn "Using outer = %A, inner = %A, total is: %A" outer inner count
for nm, a in testCases do
printfn " Running test case: %A" nm
let tm, cc0, cc1, cc2, r = timeIt outer a
printfn " it took %A ms (%A, %A, %A), result is: %A" tm cc0 cc1 cc2 r
0
The results are as follows (.NET 4.6.2, x64, Release):
Using outer = 1000000, inner = 10, total is: 10000000
Running test case: "Push stream"
it took 145L ms (22, 0, 0), result is: 5.348407591
Running test case: "LINQ"
it took 296L ms (63, 0, 0), result is: 5.056716735
Running test case: "Seq.init"
it took 1490L ms (724, 0, 0), result is: 3.977087705
Running test case: "Seq comprehension"
it took 333L ms (66, 0, 0), result is: 5.208401204
Running test case: "Tail-recursion"
it took 109L ms (0, 0, 0), result is: 5.898073628
Using outer = 10000, inner = 1000, total is: 10000000
Running test case: "Push stream"
it took 118L ms (0, 0, 0), result is: 510.943297
Running test case: "LINQ"
it took 210L ms (0, 0, 0), result is: 488.3970571
Running test case: "Seq.init"
it took 1411L ms (661, 0, 0), result is: 505.2632877
Running test case: "Seq comprehension"
it took 264L ms (0, 0, 0), result is: 502.1710107
Running test case: "Tail-recursion"
it took 101L ms (0, 0, 0), result is: 487.9451813
Using outer = 100, inner = 100000, total is: 10000000
Running test case: "Push stream"
it took 118L ms (0, 0, 0), result is: 49850.99306
Running test case: "LINQ"
it took 202L ms (0, 0, 0), result is: 50113.06557
Running test case: "Seq.init"
it took 1398L ms (661, 0, 0), result is: 49923.14521
Running test case: "Seq comprehension"
it took 262L ms (0, 0, 0), result is: 50196.00191
Running test case: "Tail-recursion"
it took 98L ms (0, 0, 0), result is: 49878.16573
Using outer = 1, inner = 10000000, total is: 10000000
Running test case: "Push stream"
it took 117L ms (0, 0, 0), result is: 5000088.583
Running test case: "LINQ"
it took 201L ms (0, 0, 0), result is: 5000569.657
Running test case: "Seq.init"
it took 1388L ms (661, 0, 0), result is: 5000169.339
Running test case: "Seq comprehension"
it took 260L ms (0, 0, 0), result is: 5000263.083
Running test case: "Tail-recursion"
it took 97L ms (0, 0, 0), result is: 4999990.197
Press any key to continue . . .
So Seq.init does the worst and "Tail-recursion" (essentially a loop) does the best in both CPU performance and memory usage.
Actually generating the random numbers itself takes some time, so if I use id instead, the numbers look like this:
Using outer = 1000000, inner = 10, total is: 10000000
Running test case: "Push stream"
it took 47L ms (22, 0, 0), result is: 45.0
Running test case: "LINQ"
it took 211L ms (63, 0, 0), result is: 45.0
Running test case: "Seq.init"
it took 1364L ms (724, 0, 0), result is: 45.0
Running test case: "Seq comprehension"
it took 241L ms (66, 0, 0), result is: 45.0
Running test case: "Tail-recursion"
it took 10L ms (0, 0, 0), result is: 45.0
Using outer = 10000, inner = 1000, total is: 10000000
Running test case: "Push stream"
it took 28L ms (0, 0, 0), result is: 499500.0
Running test case: "LINQ"
it took 125L ms (0, 0, 0), result is: 499500.0
Running test case: "Seq.init"
it took 1285L ms (661, 0, 0), result is: 499500.0
Running test case: "Seq comprehension"
it took 170L ms (0, 0, 0), result is: 499500.0
Running test case: "Tail-recursion"
it took 8L ms (0, 0, 0), result is: 499500.0
Using outer = 100, inner = 100000, total is: 10000000
Running test case: "Push stream"
it took 27L ms (0, 0, 0), result is: 4999950000.0
Running test case: "LINQ"
it took 121L ms (0, 0, 0), result is: 4999950000.0
Running test case: "Seq.init"
it took 1289L ms (661, 0, 0), result is: 4999950000.0
Running test case: "Seq comprehension"
it took 169L ms (0, 0, 0), result is: 4999950000.0
Running test case: "Tail-recursion"
it took 9L ms (0, 0, 0), result is: 4999950000.0
Using outer = 1, inner = 10000000, total is: 10000000
Running test case: "Push stream"
it took 28L ms (0, 0, 0), result is: 4.9999995e+13
Running test case: "LINQ"
it took 121L ms (0, 0, 0), result is: 4.9999995e+13
Running test case: "Seq.init"
it took 1289L ms (661, 0, 0), result is: 4.9999995e+13
Running test case: "Seq comprehension"
it took 169L ms (0, 0, 0), result is: 4.9999995e+13
Running test case: "Tail-recursion"
it took 8L ms (0, 0, 0), result is: 4.9999995e+13
Related
I am currently learning Racket (just for fun) and I stumbled upon this example:
(define doubles
(stream-cons
1
(stream-map
(lambda (x)
(begin
(display "map applied to: ")
(display x)
(newline)
(* x 2)))
doubles)))
It produces 1 2 4 8 16 ...
I do not quite understand how it works.
So it creates 1 as a first element; when I call (stream-ref doubles 1) it creates a second element which is obviously 2.
Then I call (stream-ref doubles 2) which should force creating the third element so it calls stream-map for a stream which already has 2 elements – (1 2) – so it should produce (2 4) then and append this result to the stream.
Why is this stream-map always applied to the last created element? How does it work?
Thank you for your help!
This is a standard trick that makes it possible for lazy streams to be defined in terms of their previous element. Consider a stream as an infinite sequence of values:
s = x0, x1, x2, ...
Now, when you map over a stream, you provide a function and produce a new stream with the function applied to each element of the stream:
map(f, s) = f(x0), f(x1), f(x2), ...
But what happens when a stream is defined in terms of a mapping over itself? Well, if we have a stream s = 1, map(f, s), we can expand that definition:
s = 1, map(f, s)
= 1, f(x0), f(x1), f(x2), ...
Now, when we actually go to evaluate the second element of the stream, f(x0), then x0 is clearly 1, since we defined the first element of the stream to be 1. But when we go to evaluate the third element of the stream, f(x1), we need to know x1. Fortunately, we just evaluated x1, since it is f(x0)! This means we can “unfold” the sequence one element at a time, where each element is defined in terms of the previous one:
f(x) = x * 2
s = 1, map(f, s)
= 1, f(x0), f(x1), f(x2), ...
= 1, f(1), f(x1), f(x2), ...
= 1, 2, f(x1), f(x2), ...
= 1, 2, f(2), f(x2), ...
= 1, 2, 4, f(x2), ...
= 1, 2, 4, f(4), ...
= 1, 2, 4, 8, ...
This knot-tying works because streams are evaluated lazily, so each value is computed on-demand, left-to-right. Therefore, each previous element has been computed by the time the subsequent one is demanded, and the self-reference doesn’t cause any problems.
I need to write a program, which returns a new list from a given list with following criteria.
If a list member is negative or 0, it should add that value 3 times to the new list. If a member is positive, it should add the value 2 times.
For example :
goal: dt([-3,2,0],R).
R = [-3,-3,-3,2,2,0,0,0].
I have written following code and it works fine for me, but it returns true as result instead of R = [some_values]
My code :
dt([],R):- write(R). % end print new list
dt([X|Tail],R):- X =< 0, addNegavite(Tail,X,R). % add 3 negatives or 0
dt([X|Tail],R):- X > 0, addPositive(Tail,X,R). % add 2 positives
addNegavite(Tail,X,R):- append([X,X,X],R,Z), dt(Tail, Z).
addPositive(Tail,X,R):- append([X,X],R,Z), dt(Tail, Z).
Maybe someone knows how to make it give R = [some_values] instead of just true.
Your code prepares the value of R as it goes down the recursing chain top-to-bottom, treating the value passed in as the initial list. Calling dt/2 with an empty list produces the desired output:
:- dt([-3,2,0],[]).
Demo #1 - Note the reversed order
This is, however, an unusual way of doing things in Prolog: typically, R is your return value, produced the other way around, where the base case services the "empty list" situation, and the rest of the rules grow the result from that empty list:
dt([],[]). % Base case: empty list produces an empty list
dt([X|Like],R):- X =< 0, addNegavite(Like,X,R).
dt([X|Like],R):- X > 0, addPositive(Like,X,R).
% The two remaining rules do the tail first, then append:
addNegavite(Like,X,R):- dt(Like, Z), append([X,X,X], Z, R).
addPositive(Like,X,R):- dt(Like, Z), append([X,X], Z, R).
Demo #2
Why do you call write inside your clauses?
Better don't have side-effects in your clauses:
dt([], []).
dt([N|NS], [N,N,N|MS]) :-
N =< 0,
dt(NS, MS).
dt([N|NS], [N,N|MS]) :-
N > 0,
dt(NS, MS).
That will work:
?- dt([-3,2,0], R).
R = [-3, -3, -3, 2, 2, 0, 0, 0] .
A further advantage of not invoking functions with side-effects in clauses is that the reverse works, too:
?- dt(R, [-3, -3, -3, 2, 2, 0, 0, 0]).
R = [-3, 2, 0] .
Of course you can invoke write outside of your clauses:
?- dt([-3,2,0], R), write(R).
[-3,-3,-3,2,2,0,0,0]
R = [-3, -3, -3, 2, 2, 0, 0, 0] .
I encountered a memory problem in Mathematica when I tried to process my experimental data. I'm using Mathematica to find the optimal parameters for a system of three partial differential equations.
When the e parameter was greater than 0.4, Mathematica consumed a lot of memory. For e < 0.4, the program worked properly.
I have tried using $HistoryLength = 0, and reducing AccuracyGoal and WorkingPrecision with no success.
I'm trying to understand what mistakes I made, and how I may limit the memory usage.
Clear[T, L, e, v, q, C0, R, data, model];
T = 13200;
L = 0.085;
e = 0.41;
v = 0.000557197;
q = 0.1618;
C0 = 0.0256;
R = 0.00075;
data = {{L, 600, 0.141124587}, {L, 1200, 0.254134509}, {L, 1800,
0.342888644}, {L, 2400, 0.424476295}, {L, 3600, 0.562844542}, {L,
4800, 0.657111356}, {L, 6000, 0.75137817},
{L, 7200, 0.815876516}, {L, 8430, 0.879823594}, {L, 9000,
0.900771775}, {L, 13200, 1}};
model[(De_)?NumberQ, (Kf_)?NumberQ, (Y_)?NumberQ] :=
model[De, Kf, Y] = yeld /. Last[Last[
NDSolve[{
v D[Ci[z, t], z] + D[Ci[z, t], t] == -((
3 (1 - e) Kf (Ci[z, t] - C0))/(
R e (1 - (R Kf (1 - R/r[z, t]))/De))),
D[r[z, t], t] == (R^2 Kf (Ci[z, t] - C0))/(
q r[z, t]^2 (1 - (R Kf (1 - R/r[z, t]))/De)),
D[yeld[z, t], t] == Y*(v e Ci[z, t])/(L q (1 - e)),
r[z, 0] == R,
Ci[z, 0] == 0,
Ci[0, t] == 0,
yeld[z, 0] == 0},
{r[z, t], Ci[z, t], yeld}, {z, 0, L}, {t, 0, T}]]]
fit = FindFit[
data, {model[De, Kf, Y][z, t], {0.97 < Y < 1.03,
10^-6 < Kf < 10^-4, 10^-13 < De < 10^-9}},
{{De, 10^-12}, {Kf, 10^-6}, {Y, 1}}, {z, t}, Method -> NMinimize]
data = {{600, 0.141124587}, {1200, 0.254134509}, {1800,
0.342888644}, {2400, 0.424476295}, {3600, 0.562844542}, {4800,
0.657111356}, {6000, 0.75137817}, {7200, 0.815876516},
{8430, 0.879823594}, {9000, 0.900771775}, {13200, 1}};
YYY = model[De /. fit[[1]], Kf /. fit[[2]], Y /. fit[[3]]];
Show[Plot[Evaluate[YYY[L, t]], {t, 0, T}, PlotRange -> All],
ListPlot[data, PlotStyle -> Directive[PointSize[Medium], Red]]]
Link to the .nb file: http://www.4shared.com/folder/249TSjlz/_online.html
I have a sneaking suspicion the reason why it's failing on you is because you're caching the results.
Do you need to store every single solution that NDSolve is producing? I'm somewhat skeptical of whether this is useful or not for FindFit, since I highly doubt it would revisit a past result.
Besides, it's not like you're talking about integers where there's a finite domain you're talking about. You're using reals and even over the range you specify, there's A LOT of different solutions possible. I don't think you want to store each and every one of them.
Rewrite your code so that instead of having:
model[(De_)?NumberQ, (Kf_)?NumberQ, (Y_)?NumberQ] :=
model[De, Kf, Y] = yeld /. Last[Last[
NDSolve[..]
You instead have:
model[(De_)?NumberQ, (Kf_)?NumberQ, (Y_)?NumberQ] :=
NDSolve[..]
By caching your previous results, you're going to eat up memory like no tomorrow with FindFit. Typically it's useful when you have a recurrence relation, but here I'd seriously advise against it.
Some Notes:
After running for 2415 seconds on my machine, Mathematica's memory usage went from 112,475,400 bytes to 1,642,280,320 bytes with the caching.
I'm currently running the code without the caching now.
Hello everyone
I have converted a project in C# to F# that paints the Mandelbrot set.
Unfortunately, it takes around one minute to render a full screen, so I am trying to find some ways to speed it up.
There is one call that takes almost all of the time:
Array.map (fun x -> this.colorArray.[CalcZ x]) xyArray
xyArray (double * double) [] => (array of tuple of double)
colorArray is an array of int32 length = 255
CalcZ is defined as:
let CalcZ (coord:double * double) =
let maxIterations = 255
let rec CalcZHelper (xCoord:double) (yCoord:double) // line break inserted
(x:double) (y:double) iters =
let newx = x * x + xCoord - y * y
let newy = 2.0 * x * y + yCoord
match newx, newy, iters with
| _ when Math.Abs newx > 2.0 -> iters
| _ when Math.Abs newy > 2.0 -> iters
| _ when iters = maxIterations -> iters
| _ -> CalcZHelper xCoord yCoord newx newy (iters + 1)
CalcZHelper (fst coord) (snd coord) (fst coord) (snd coord) 0
As I only use around half of the processor capacity, one idea is to use more threads — specifically Array.Parallel.map, which translates to System.Threading.Tasks.Parallel.
Now my question
A naive solution, would be:
Array.Parallel.map (fun x -> this.colorArray.[CalcZ x]) xyArray
but that took twice the time, how can I rewrite this to take less time, or can I take some other way to utilize the processor better?
Thanks in advance
Gorgen
---edit---
the function that is calling CalcZ looks like this:
let GetMatrix =
let halfX = double bitmap.PixelWidth * scale / 2.0
let halfY = double bitmap.PixelHeight * scale / 2.0
let rect:Mandelbrot.Rectangle =
{xMax = centerX + halfX; xMin = centerX - halfX;
yMax = centerY + halfY; yMin = centerY - halfY;}
let size:Mandelbrot.Size =
{x = bitmap.PixelWidth; y = bitmap.PixelHeight}
let xyList = GenerateXYTuple rect size
let xyArray = Array.ofList xyList
Array.map (fun x -> this.colorArray.[CalcZ x]) xyArray
let region:Int32Rect = new Int32Rect(0,0,bitmap.PixelWidth,bitmap.PixelHeight)
bitmap.WritePixels(region, GetMatrix, bitmap.PixelWidth * 4, region.X, region.Y);
GenerateXYTuple:
let GenerateXYTuple (rect:Rectangle) (pixels:Size) =
let xStep = (rect.xMax - rect.xMin)/double pixels.x
let yStep = (rect.yMax - rect.yMin)/double pixels.y
[for column in 0..pixels.y - 1 do
for row in 0..pixels.x - 1 do
yield (rect.xMin + xStep * double row,
rect.yMax - yStep * double column)]
---edit---
Following a suggestion from kvb (thanks a lot!) in a comment to my question, I built the program in Release mode. Building in Release mode generally sped things up.
Just building in Release took me from 50 s to around 30 s; merging all the transforms on the array so they happen in one pass made it around 10 seconds faster. Finally, using Array.Parallel.init brought me to just over 11 seconds.
What I learnt from this is.... Use the release mode when timing things and using parallel constructs...
One more time, thanks for the help I have received.
--edit--
by using SSE assembly from a native dll I have been able to slash the time from around 12 seconds to 1.2 seconds for a full screen of the most computationally intensive points. Unfortunately I don't have a graphics processor...
Gorgen
Per the comment on the original post, here is the code I wrote to test the function. The fast version only takes a few seconds on my average workstation. It is fully sequential, and has no parallel code.
It's moderately long, so I posted it on another site: http://pastebin.com/Rjj8EzCA
I'm suspecting that the slowdown you are seeing is in the rendering code.
I don't think that the Array.Parallel.map function (which uses Parallel.For from .NET 4.0 under the cover) should have trouble parallelizing the operation if it runs a simple function ~1 million times. However, I encountered some weird performance behavior in a similar case when F# didn't optimize the call to the lambda function (in some way).
I'd try taking a copy of the Parallel.map function from the F# sources and adding inline. Try adding the following map function to your code and use it instead of the one from F# libraries:
let inline map (f: 'T -> 'U) (array : 'T[]) : 'U[]=
let inputLength = array.Length
let result = Array.zeroCreate inputLength
Parallel.For(0, inputLength, fun i ->
result.[i] <- f array.[i]) |> ignore
result
As an aside, it looks like you're generating an array of coordinates and then mapping it to an array of results. You don't need to create the coordinate array if you use the init function instead of map: Array.Parallel.init 1000 (fun y -> Array.init 1000 (fun x -> this.colorArray.[CalcZ (x, y)]))
EDIT: The following may be inaccurate:
Your problem could be that you call a tiny function a million times, causing the scheduling overhead to overwhelm the actual work you're doing. You should partition the array into much larger chunks so that each individual task takes a millisecond or so. You can use an array of arrays so that you would call Array.Parallel.map on the outer arrays and Array.map on the inner arrays. That way each parallel operation will operate on a whole row of pixels instead of just a single pixel.
I'm trying to parallelize the element-by-element multiplication of two matrices in F#. I can't quite figure it out, though. I keep trying to create tasks but it never wants to compile. My non-working messy code is the following:
let myBigElemMultiply (m:matrix) (n:matrix) =
let AddTwoRows (row:int) (destination:matrix) (source1:matrix) (source2:matrix) =
for i in 0 .. destination.NumCols
destination.[row, i] <- source1.[row,i] + source2.[row,i]
destination
let result = Matrix.zero(m.NumRows)
let operations = [ for i in 0 .. m.NumRows -> AddTwoRows i result m n ]
let parallelTasks = Async.Parallel operations
Async.RunSynchronously parallelTasks
result
You have made several small mistakes, e.g., you haven't figured out how to do matrix multiplication.
let myBigElemMultiply (m:matrix) (n:matrix) =
let AddTwoRows (row:int) (destination:matrix) (source1:matrix) (source2:matrix) =
for col=0 to destination.NumCols-1 do
let mutable sum = 0.0
for k=0 to m.NumCols-1 do
sum <- sum + source1.[row,k] * source2.[k,col]
destination.[row,col] <- sum
let result = Matrix.zero m.NumRows n.NumCols
let operations = [ for i=0 to m.NumRows-1 do yield async { AddTwoRows i result m n} ]
let parallelTasks = Async.Parallel operations
Async.RunSynchronously parallelTasks |> ignore
result
One thing to notice is that this code would perform very badly because m.[i,j] is an inefficient way to access elements in a matrix. You'd better use a 2D array:
let myBigElemMultiply2 (m:matrix) (n:matrix) =
let AddTwoRows (row:int) (destination:matrix) (source1:matrix) (source2:matrix) =
let destination = destination.InternalDenseValues
let source1 = source1.InternalDenseValues
let source2 = source2.InternalDenseValues
for col=0 to Array2D.length2 destination - 1 do
let mutable sum = 0.0
for k=0 to Array2D.length1 source2 - 1 do
sum <- sum + source1.[row,k] * source2.[k,col]
destination.[row,col] <- sum
let result = Matrix.zero m.NumRows n.NumCols
let operations = [ for i=0 to m.NumRows-1 do yield async { AddTwoRows i result m n} ]
let parallelTasks = Async.Parallel operations
Async.RunSynchronously parallelTasks |> ignore
result
testing:
let r = new Random()
let A = Matrix.init 280 10340 (fun i j -> r.NextDouble() )
let B = A.Transpose
some timing:
> myBigElemMultiply A B;;
Real: 00:00:22.111, CPU: 00:00:41.777, GC gen0: 0, gen1: 0, gen2: 0
val it : unit = ()
> myBigElemMultiply2 A B;;
Real: 00:00:08.736, CPU: 00:00:15.303, GC gen0: 0, gen1: 0, gen2: 0
val it : unit = ()
> A*B;;
Real: 00:00:13.635, CPU: 00:00:13.166, GC gen0: 0, gen1: 0, gen2: 0
val it : unit = ()
>
Check here by using ParallelFor, which should have a better performance than async.
Here's at least some code that compiles, perhaps this will get you headed in the right direction?
let myBigElemMultiply (m:matrix) (n:matrix) =
let AddTwoRows (row:int) (destination:matrix) (source1:matrix) (source2:matrix) =
async {
for i in 0 .. destination.NumCols do
destination.[row, i] <- source1.[row,i] + source2.[row,i]
}
let result = Matrix.zero m.NumRows m.NumCols
let operations = [ for i in 0 .. m.NumRows -> AddTwoRows i result m n ]
let parallelTasks = Async.Parallel operations
Async.RunSynchronously parallelTasks |> ignore
result
There's no point. Out-of-place element-wise multiplication of a pair of matrices is little more than copying, at which point a single core will happily max out the entire memory bandwidth of your machine and adding more cores will not improve performance. So it is almost certainly a waste of time.