Chunking a Stream with a custom chunking function

I have a Stream holding an enum of two-element tuples, such as:
[
{:dialogue, %{}},
{:info, %{}},
{:info, %{}},
{:info, %{}},
{:dialogue, %{}},
{:dialogue, %{}},
{:info, %{}}
...
]
And my end goal is to chunk the objects so that each chunk starts with a {:dialogue, %{}} tuple.
Originally I had code like:
stream
|> Stream.chunk_by(fn {type, _} -> type end) # [[dialogue], [info, info], [dialogue]...]
|> Stream.chunk(2) # [[[dialogue], [info, info, info]], [[dialogue], ...]]
But I soon realized that this fell down when there were two dialogue tuples in a row - the pair chunking no longer gave the expected outcome.
Ideally I'd like to create some method that works like:
chunk_when(list, fn({type, record}) -> type == :dialogue end) |> Enum.to_list
=> [[dialogue, info, info, info], [dialogue], [dialogue, info]...]
But I'm stuck on how to do this with the Stream module.
There must be some invocation of Stream.transform/4 or Stream.resource/3 that would make it work, but I can't figure it out.
This question is the same idea, but it works with lists and not streams - which have a different API.

Stream.transform/3 comes to the rescue:
defmodule A do
  @input [
    {:dialogue, %{}},
    {:info, %{}},
    {:info, %{}},
    {:info, %{}},
    {:dialogue, %{}},
    {:dialogue, %{}},
    {:info, %{}}
  ]

  def chunk_when(input \\ @input, type \\ :dialogue) do
    input
    |> Stream.concat([nil]) # bah! sentinel so the last chunk gets flushed
    |> Stream.transform([], fn e, acc ->
      case e do
        # sentinel reached: emit whatever is accumulated as the final chunk
        nil -> {[acc], nil}
        # a new chunk begins; emit the previous one unless it is empty (bah!)
        {^type, _} -> {(if Enum.empty?(acc), do: [], else: [acc]), [e]}
        # any other tuple joins the current chunk
        {_, _} -> {[], acc ++ [e]}
      end
    end)
  end
end
IO.inspect Enum.to_list(A.chunk_when())
#⇒ [[dialogue: %{}, info: %{}, info: %{}, info: %{}],
# [dialogue: %{}],
# [dialogue: %{}, info: %{}]]
I am open to suggestions on how to make it more elegant in the two obviously dirty places: how to avoid appending nil to the input to catch the last chunk, and how to avoid the clumsy if for the very first :dialogue occurrence.
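For the record, Stream.chunk_while/4 (available since Elixir 1.5) can express the same idea without either wart: its after_fun flushes the final chunk, so no nil sentinel is needed, and the empty initial accumulator covers the very first :dialogue without a special case. A sketch, prepending to the accumulator and reversing on emit to avoid the quadratic acc ++ [e]:
defmodule B do
  def chunk_when(input, type \\ :dialogue) do
    Stream.chunk_while(
      input,
      [],
      fn
        # first chunk opener: nothing accumulated yet, so emit nothing
        {^type, _} = e, [] -> {:cont, [e]}
        # chunk opener: emit the finished chunk and start a new one
        {^type, _} = e, acc -> {:cont, Enum.reverse(acc), [e]}
        # anything else joins the current chunk
        e, acc -> {:cont, [e | acc]}
      end,
      # after_fun: flush whatever is left once the stream ends
      fn
        [] -> {:cont, []}
        acc -> {:cont, Enum.reverse(acc), []}
      end
    )
  end
end
Enum.to_list(B.chunk_when(input)) produces the same three chunks as the output above.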

Related

What can cause SystemLimitError when calling timer.tc?

I am writing a simple module in Elixir that loads data into a gen_server written in Erlang. I need to measure the time of the loading operations, and that is what I am having trouble with. When I call :timer.tc() on line 46 (the one with createStations) I get a SystemLimitError. I have no idea what could cause such behaviour and would be grateful for any tips. Below is the code for the Elixir module used; I have worked with the Erlang gen_server before and no such errors occurred.
defmodule Data do
  defp nameStation({:cords, lat, lang}) do
    "station_#{lang}_#{lat}}}"
  end

  defp identifyStations(data) do
    data
    |> Enum.map(& &1.location)
    |> Enum.uniq()
    |> Enum.map(fn {lat, lang} -> {:cords, lat, lang} end)
  end

  defp createStations(data) do
    identifyStations(data)
    |> Enum.each(fn cords -> :pollution_gen_server.addStation(nameStation(cords), cords) end)
  end

  defp createMesurements(data) do
    data
    |> Enum.each(fn value ->
      :pollution_gen_server.addValue(value.location, value.datetime, "PM10", value.pollutionLevel)
    end)
  end

  defp importLinesFromCSV(path) do
    File.read!(path) |> String.split("\r\n")
  end

  defp parse(line) do
    [date_l, time_l, lang_l, lat_l, pollution_l] = String.split(line, ",")
    lat = lat_l |> Float.parse() |> elem(0)
    lang = lang_l |> Float.parse() |> elem(0)
    pollution = pollution_l |> Integer.parse() |> elem(0)

    {hours, mins} =
      time_l
      |> String.split(":")
      |> Enum.map(&Integer.parse/1)
      |> Enum.map(&elem(&1, 0))
      |> :erlang.list_to_tuple()

    date =
      date_l
      |> String.split(":")
      |> Enum.map(&Integer.parse/1)
      |> Enum.reverse()
      |> Enum.map(&elem(&1, 0))
      |> :erlang.list_to_tuple()

    %{
      datetime: {date, {hours, mins, 0}},
      location: {lat, lang},
      pollutionLevel: pollution
    }
  end

  def run(path) do
    :pollution_sup.start_link()
    lines = importLinesFromCSV(path) |> Enum.map(&parse/1)

    time_stations = :timer.tc(&createStations/1, lines) |> elem(0) |> Kernel./(1_000_000)
    time_measurements = :timer.tc(&createMesurements/1, lines) |> elem(0) |> Kernel./(1_000_000)

    time_mean =
      :timer.tc(&:pollution_gen_server.getStationMean/2, ["PM10", {:cords, 49.986, 20.06}])
      |> elem(0)
      |> Kernel./(1_000_000)

    mean = :pollution_gen_server.getStationMean("PM10", {:cords, 49.986, 20.06})

    time_daily =
      :timer.tc(&:pollution_gen_server.getDailyMean/2, ["PM10", {2017, 5, 3}])
      |> elem(0)
      |> Kernel./(1_000_000)

    daily = :pollution_gen_server.getDailyMean("PM10", {2017, 5, 3})

    IO.puts "Time of loading stations: #{time_stations}"
    IO.puts "Time of loading mesurements: #{time_measurements}"
    IO.puts "Time of getting mean: #{time_mean} result: #{mean}"
    IO.puts "Time of getting daily: #{time_daily} result: #{daily}"
  end
end
Does the call :pollution_gen_server.addStation(nameStation(cords),...) create an atom from the name? In that case, you could be overflowing the atom table, which by default has room for 1,048,576 unique atoms (i.e., just over a million). If you can't rewrite the code, you could try raising the limit with the +t flag when starting the system.
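If you want to confirm that hypothesis, a quick check (assuming OTP 20 or later, where these system_info flags exist) is to watch the atom table while the import runs:
# If atom_count keeps climbing towards atom_limit during the load,
# the atom table is indeed the culprit.
IO.inspect(:erlang.system_info(:atom_count), label: "atoms in use")
IO.inspect(:erlang.system_info(:atom_limit), label: "atom limit")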

Streaming: VM freezes for large file

I am building a pipeline to process, aggregate and transform data from CSV files, then write the result back to another CSV file. I load rows from a 19-column CSV file and, with some mathematical operations (map/reduce style), write back 30 columns into another CSV.
It was going fine until I tried to upload a 25 MB file with 250,000 rows to the application, at which point I decided to stream all the operations instead of processing eagerly. But now that I am converting function by function to streams, I hit a problem I don't understand: with only 5 fields created so far, when I try to write to the file the program simply freezes and stops writing after a few thousand lines.
I am streaming every single function, so as far as I understand there should be no locks, and for the first few thousand writes it works fine, so I wonder what is happening. In the Erlang observer I can only see that resource usage dropped to near zero and nothing is written to the file any more.
This is my stream function (before this I just load from the file); the write function follows:
def process(stream, field_longs_lats, team_settings) do
  main_stream =
    stream
    # Remove rows that don't have a timestamp
    |> Stream.filter(fn [time | _tl] -> time != "-" end)
    # Drop rows duplicated by timestamp
    |> Stream.uniq_by(fn [time | _tl] -> time end)
    |> Stream.map(&Transform.apply_row_tranformations/1)

  cumulative_milli =
    main_stream
    |> Stream.map(fn [_time, milli | _tl] -> milli end)
    |> Statistics.cumulative_sum()

  speeds =
    main_stream
    |> Stream.map(fn [_time, _milli, _lat, _long, pace | _tl] -> pace end)
    |> Stream.map(&Statistics.get_speed/1)

  cals = Motion.calories_per_timestep(cumulative_milli, cumulative_milli)

  long_stream =
    main_stream
    |> Stream.map(fn [_time, _milli, lat | _tl] -> lat end)

  lat_stream =
    main_stream
    |> Stream.map(fn [_time, _milli, _lat, long | _tl] -> long end)

  x_y_tuples =
    RelativeCoordinates.relative_coordinates(long_stream, lat_stream, field_longs_lats)

  x = Stream.map(x_y_tuples, fn {x, _y} -> x end)
  y = Stream.map(x_y_tuples, fn {_x, y} -> y end)

  [x, y, cals, long_stream, lat_stream]
end
write:
def write_to_file(keyword_list, file_name) do
  file = File.open!(file_name, [:write, :utf8])
  IO.write(file, V4.empty_v4_headers() <> "\n")

  keyword_list
  |> Stream.zip()
  |> Stream.each(&write_tuple_row(&1, file))
  |> Stream.run()

  File.close(file)
end

@spec write_tuple_row(tuple(), pid()) :: :ok
def write_tuple_row(tuple, file) do
  IO.inspect("writing #{inspect(tuple)}")

  row_content =
    Tuple.to_list(tuple)
    |> Enum.map_join(",", fn value -> Transformations.to_string(value) end)

  IO.write(file, row_content <> "\n")
end

Elixir Queue (Erlang :queue) - Enum.Take

Going through the docs of the Erlang queue here: http://erlang.org/doc/man/queue.html#member-2
I don't see a way to pull off a range of items like Enum.take. Has anyone solved this?
Erlang is proud of using recursion wherever possible instead of imperative calls. The desired behaviour might be easily implemented:
def take(q, amount), do: do_take(q, {amount, []})

defp do_take(q, {n, acc}) when n > 0 do
  case :queue.out(q) do
    {{:value, e}, rest} -> do_take(rest, {n - 1, [e | acc]})
    # queue exhausted early; reverse so elements come back in queue order
    {:empty, q} -> {Enum.reverse(acc), q}
  end
end

defp do_take(q, {_, acc}), do: {Enum.reverse(acc), q}
I have not tested this code, but I believe the idea is clear.
Or, with a fancy else syntax:
def take(q, amount), do: do_take(q, {amount, []})

defp do_take(q, {n, acc}) when n > 0 do
  :queue.out(q)
else
  {{:value, e}, rest} -> do_take(rest, {n - 1, [e | acc]})
  {:empty, q} -> {Enum.reverse(acc), q}
end

defp do_take(q, {_, acc}), do: {Enum.reverse(acc), q}
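A quick usage sketch, assuming the functions above live in a module called MyQueue with take/2 public:
q = :queue.from_list([1, 2, 3, 4])
{taken, _rest} = MyQueue.take(q, 3)
# taken == [1, 2, 3]; _rest is the queue holding the leftover element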
I don't see a way to pull off a range of items like Enum.take. Has anyone solved this?
Yep. From the page you linked:
split(N :: integer() >= 0, Q1 :: queue(Item)) ->
{Q2 :: queue(Item), Q3 :: queue(Item)}
Splits Q1 in two. The N front items are put in Q2 and the rest in Q3.
So, you can do this:
-module(my).
-compile(export_all).

take(N, Q) ->
    {Taken, _Rest} = queue:split(N, Q),
    Taken.
In the shell:
1> c(my).
my.erl:2: Warning: export_all flag enabled - all functions will be exported
{ok,my}
2> Q = queue:from_list([1,2,3,4]).
{[4],[1,2,3]}
3> T1 = my:take(1, Q).
{[],[1]}
4> queue:to_list(T1).
[1]
5> T3 = my:take(3, Q).
{[3],[1,2]}
6> queue:to_list(T3).
[1,2,3]
All operations have an amortized O(1) running time, except filter/2,
join/2, len/1, member/2, split/2 that have O(n).
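The same approach works from the Elixir side too; a sketch (clamping the amount, since :queue.split/2 raises badarg when asked for more elements than the queue holds):
def take(q, amount) do
  # Never ask :queue.split/2 for more than the queue contains.
  {taken, _rest} = :queue.split(min(amount, :queue.len(q)), q)
  :queue.to_list(taken)
end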

Elixir: Stream.drop_while generalisation?

I'd like to generalise this code but can't see how to do so in a neat way:
defmodule Demo do
  defp side_effect(bool, label) do
    if bool do
      IO.puts label
    end
  end

  def go do
    {a, b, c} = {4, 8, 13}

    [2, 3, 4, 5, 7, 8, 9, 10, 11, 12] # always in ascending order
    |> Stream.drop_while(fn i -> bool = i < a; side_effect(bool, "a_#{i}"); bool end)
    |> Stream.drop_while(fn i -> bool = i < b; side_effect(bool, "b_#{i}"); bool end)
    |> Stream.drop_while(fn i -> bool = i < c; side_effect(bool, "c_#{i}"); bool end)
    |> Enum.to_list
  end
end
When this runs (Demo.go) I get:
a_2
a_3
b_4
b_5
b_7
c_8
c_9
c_10
c_11
c_12
[]
just as I wished - getting a side effect executed for each element in the input list, with an empty list as final output.
But is it possible to generalise this so I can programmatically include (based on a list) as many lines as I like, such as:
|> Stream.drop_while( fn i -> bool = (i<x); side_effect(bool, "x_#{i}"); bool end )
I was hoping not to explore macros if I can help it.
A stream is a data structure, which means you can reduce over it, refining it to a particular set of values on every step:
defmodule Demo do
  defp side_effect(var, threshold, label) do
    if var < threshold do
      IO.puts "#{label}_#{var}"
      true
    else
      false
    end
  end

  def go do
    values = [a: 4, b: 8, c: 13]

    stream =
      Enum.reduce(values, [2, 3, 4, 5, 7, 8, 9, 10, 11, 12], fn {k, v}, acc ->
        Stream.drop_while(acc, fn i -> side_effect(i, v, k) end)
      end)

    Enum.to_list(stream)
  end
end
You can also explore other solutions. For example, instead of creating a stream per threshold, you can simply do a filter operation that checks whether the value is below one of the thresholds in the list. Something like this:
defmodule Demo do
  defp side_effect(i, values) do
    case Enum.find(values, fn {_, v} -> i < v end) do
      {k, _v} ->
        IO.puts "#{k}_#{i}"
        false

      nil ->
        true
    end
  end

  def go do
    values = [a: 4, b: 8, c: 13]

    [2, 3, 4, 5, 7, 8, 9, 10, 11, 12]
    |> Stream.filter(fn i -> side_effect(i, values) end)
    |> Enum.to_list()
  end
end
I am not sure you actually need to filter. If not, Stream.map/2 or Stream.each/2 (which exists exclusively for side effects) would work better.
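For instance, if nothing should be dropped and only the side effect matters, a minimal sketch with Stream.each/2 (the "seen_" label is made up for illustration):
[2, 3, 4, 5]
|> Stream.each(fn i -> IO.puts("seen_#{i}") end) # side effect only; values pass through
|> Stream.run() # force the lazy stream for its side effects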

How can I create a sequence of registered process names in Erlang?

At the moment I am using the following to create and register processes individually:
register(name, spawn(fun() -> myfun())).
I would like to create a list of N registered Pids with names as follows:
pid1
pid2
pid3
.
.
.
pidN
Can anyone recommend a way of doing this?
If you want to spawn multiple processes running the same function, you can use a list comprehension with string concatenation and list_to_atom/1:
[register(list_to_atom("pid" ++ integer_to_list(X)), spawn(fun() -> myfun() end)) || X <- lists:seq(1, 10)].
You might use the following.
If you want to execute the same function in multiple processes and register each process under a different name:
lists:foreach(
    fun( Name ) ->
        register( Name, spawn( fun() -> myfun() end ) )
    end,
    [ pid1, pid2, pid3 ] ).
or, if you want to execute different functions in different processes:
lists:foreach(
    fun( { Name, Func } ) ->
        register( Name, spawn( fun() -> Func() end ) )
    end,
    [ { pid1, fun f1/0 }, { pid2, fun f2/0 }, { pid3, fun f3/0 } ] ).
(f1, f2, f3 are zero-arity functions, passed as funs so that Func() is callable)
and finally, if you want to execute the same function in N processes, you might do this:
N = 20, % number of processes
lists:map(
    fun( Num ) ->
        register(
            list_to_atom( "pid" ++ integer_to_list( Num ) ),
            spawn( fun() -> myfun() end ) )
    end,
    lists:seq( 1, N ) ).
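For completeness, a minimal sketch of the same pattern from Elixir (my_fun stands in for whatever work each process should do; the usual caveat applies that dynamically created atoms are never garbage collected):
# my_fun is a placeholder workload for each spawned process.
my_fun = fn -> Process.sleep(:infinity) end

for i <- 1..20 do
  pid = spawn(my_fun)
  Process.register(pid, String.to_atom("pid#{i}"))
end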
