Code review of CYK algorithm implementation in Erlang

I am a beginner in Erlang and, as an exercise, I tried to implement the CYK algorithm.
Main code (cyk.erl):
%%% An attempt for a CYK parser in Erlang
-module(cyk).
-export([
         init_storage/0,
         import_grammar_file/1,
         add_grammar_rule/1,
         analyze/1,
         test_analyze/0
        ]).

%% Initialize the ets storage for grammar
init_storage() ->
    ets:new(?MODULE, [bag, named_table]).

%%------------------------------------------
%%
%% Grammar
%%
%%------------------------------------------

%% Import a grammar file
import_grammar_file(File) ->
    {ok, Device} = file:open(File, read),
    import_file_rules(Device).

%% Import all the rules in the file
import_file_rules(Device) ->
    case io:get_line(Device, "") of
        eof ->
            io:format("Grammar file imported~n"),
            file:close(Device);
        Line ->
            add_grammar_rule(Line),
            import_file_rules(Device)
    end.

%% Add a grammar rule
add_grammar_rule(Rule) ->
    case re:run(Rule, "^([^\s]+)\s?->\s?([^\n]+)$", [{capture, all_but_first, binary}]) of
        {match, [A, B]} ->
            ets:insert(?MODULE, {A, B}),
            io:format("parsing ~p -> ~p~n", [A, B]);
        nomatch ->
            io:format("cannot parse ~p~n", [Rule])
    end.
%%------------------------------------------
%%
%% Main logic
%%
%%------------------------------------------
%% Analyze a sentence
analyze(Sentence) ->
    io:format("analysing: ~p~n", [Sentence]),
    WordList = re:split(Sentence, " "),
    io:format("wordlist: ~p~n", [WordList]),
    Representation = lists:map(fun(Word) -> associate(Word) end, WordList),
    io:format("representation: ~p~n", [Representation]),
    Result = process([Representation]),
    io:format("result: ~p~n", [Result]).

% associate sentence words with grammar terms
associate(Word) ->
    case ets:match(cyk, {'$1', Word}) of
        [H|T] -> lists:flatten([H|T]);
        [] -> []
    end.

% process sentence representation
process(Representation) ->
    Limit = length(lists:last(Representation)),
    process(Representation, Limit).

process(Representation, Limit) when Limit > 1 ->
    NextStep = process(Representation, 1, Limit-1, []),
    process([NextStep|Representation], Limit-1);
process(Representation, _Limit) ->
    Representation.

process(Representation, Index, Limit, Acc) when Index =< Limit ->
    Subtree = extract_subtree(lists:reverse(Representation), Index),
    Result = process_subtree(Subtree),
    process(Representation, Index+1, Limit, [Result|Acc]);
process(_Representation, _Index, _Limit, Acc) ->
    lists:reverse(Acc).
%%------------------------------------------
%%
%% Subtree
%%
%%------------------------------------------
process_subtree(Subtree) ->
    process_subtree(Subtree, Subtree, [], 1).

process_subtree([], _Subtree, Acc, _Index) ->
    Acc;
process_subtree([H|T], Subtree, Acc, Index) ->
    A = lists:nth(1, H),
    Bind = length(Subtree) - Index + 1,
    B = lists:last(lists:nth(Bind, Subtree)),
    % generating the possibilities of grammar
    Pos = [ list_to_binary(binary:bin_to_list(X) ++ " " ++ binary:bin_to_list(Y)) || X <- A, Y <- B ],
    % looking up in the grammar
    Result = lists:flatten([ ets:match(cyk, {'$1', X}) || X <- Pos ]),
    process_subtree(T, Subtree, Acc ++ Result, Index + 1).

%% Extract a subtree from the representation
extract_subtree(Representation, Position) ->
    Size = length(Representation) + 1,
    extract_subtree(Representation, Size, Position, []).

extract_subtree([], _Size, _Position, Acc) ->
    lists:reverse(Acc);
extract_subtree([H|T], Size, Position, Acc) ->
    Segment = lists:sublist(H, Position, Size),
    extract_subtree(T, Size - 1, Position, [Segment|Acc]).
%%------------------------------------------
%%
%% Test
%% using the same example as
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------
test_analyze() ->
    init_storage(),
    import_grammar_file("grammar.txt"),
    analyze("she eats a fish with a fork").
The grammar file (grammar.txt)
S -> NP VP
VP -> VP PP
VP -> V NP
VP -> eats
PP -> P NP
NP -> Det N
NP -> she
V -> eats
P -> with
N -> fish
N -> fork
Det -> a
The code can be tested from the Erlang shell:
> c(cyk).
> cyk:test_analyze().
parsing <<"S">> -> <<"NP VP">>
parsing <<"VP">> -> <<"VP PP">>
parsing <<"VP">> -> <<"V NP">>
parsing <<"VP">> -> <<"eats">>
parsing <<"PP">> -> <<"P NP">>
parsing <<"NP">> -> <<"Det N">>
parsing <<"NP">> -> <<"she">>
parsing <<"V">> -> <<"eats">>
parsing <<"P">> -> <<"with">>
parsing <<"N">> -> <<"fish">>
parsing <<"N">> -> <<"fork">>
parsing <<"Det">> -> <<"a">>
Grammar file imported
analysing: "she eats a fish with a fork"
wordlist: [<<"she">>,<<"eats">>,<<"a">>,<<"fish">>,<<"with">>,<<"a">>,
<<"fork">>]
representation: [[<<"NP">>],
[<<"VP">>,<<"V">>],
[<<"Det">>],
[<<"N">>],
[<<"P">>],
[<<"Det">>],
[<<"N">>]]
result: [[[<<"S">>]],
[[],[<<"VP">>]],
[[],[],[]],
[[<<"S">>],[],[],[]],
[[],[<<"VP">>],[],[],[<<"PP">>]],
[[<<"S">>],[],[<<"NP">>],[],[],[<<"NP">>]],
[[<<"NP">>],
[<<"VP">>,<<"V">>],
[<<"Det">>],
[<<"N">>],
[<<"P">>],
[<<"Det">>],
[<<"N">>]]]
The code seems to work fine for this example, but I was looking for ways to improve it (make it more Erlang-ish) and especially to make the processing distributed over multiple processes/nodes.
I guess all the process_subtree executions for each step could be done concurrently, but I can't really figure out how.
Any suggestions will be greatly appreciated!
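One possible direction (a minimal sketch, not from the original post; the helper names pmap/2 and process_row/2 are made up): the per-index calls in process/4 are independent of each other, so each row could be computed by spawning one process per cell and collecting the results in order. Reads on the protected named ets table work from spawned processes on the same node.
%% Minimal sketch (not part of the original module): compute one CYK row
%% by running each process_subtree call in its own process.
pmap(F, List) ->
    Parent = self(),
    Refs = [begin
                Ref = make_ref(),
                spawn(fun() -> Parent ! {Ref, F(X)} end),
                Ref
            end || X <- List],
    [receive {Ref, Result} -> Result end || Ref <- Refs].

%% process/2 could then call process_row(Representation, Limit - 1)
%% instead of process(Representation, 1, Limit - 1, []).
process_row(Representation, Limit) ->
    Reversed = lists:reverse(Representation),
    pmap(fun(Index) ->
                 process_subtree(extract_subtree(Reversed, Index))
         end,
         lists:seq(1, Limit)).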

I have written this solution, which uses concurrent execution.
Compared to Eric's solution, some changes were needed for the use of multiple processes, some others because I think they are more efficient (I swapped keys and values in the rules ets table, and I have chosen a set), some because I think they are cleaner (I close the grammar file in the function that opens it), and some because I am more familiar with these modules (string:tokens ...).
[edit]
I have replaced a useless spawn with a faster recursive call, and removed the wait function by adding a message to synchronize the processes.
I got the idea for this implementation from a JavaScript animation of the CYK algorithm, which is unfortunately no longer available.
@Eric, it is possible to look at all the steps of the analysis by opening the analyze ets table with observer, which is why I do not delete it.
-module(cyk).
-export([
         import_grammar_file/1,
         add_grammar_rule/2,
         analyze/1,
         test_analyze/1,
         test_analyze/0
        ]).

%%------------------------------------------
%%
%% Grammar
%%
%%------------------------------------------

%% Import a grammar file
import_grammar_file(File) ->
    reset_ets(rules, ets:info(rules)),
    {ok, Device} = file:open(File, read),
    ok = add_grammar_rule(Device, file:read_line(Device)),
    file:close(Device),
    io:format("Grammar file imported~n").

%% Add a grammar rule
add_grammar_rule(_, eof) -> ok;
add_grammar_rule(Device, {ok, Rule}) ->
    [T, "->", H|Q] = string:tokens(Rule, " \n"),
    Key = key(H, Q),
    insert(Key, T, ets:lookup(rules, Key)),
    add_grammar_rule(Device, file:read_line(Device)).

key(H, []) -> H;
key(H, [Q]) -> {H, Q}.

insert(Key, T, []) -> ets:insert(rules, {Key, [T]});
insert(Key, T, [{Key, L}]) -> ets:insert(rules, {Key, [T|L]}).
%%------------------------------------------
%%
%% Main logic
%%
%%------------------------------------------
%% Analyze a sentence
analyze(Sentence) ->
    reset_ets(analyze, ets:info(analyze)),
    io:format("analysing: ~p~n", [Sentence]),
    WordList = string:tokens(Sentence, " "),
    Len = length(WordList),
    Me = self(),
    lists:foldl(fun(X, {J, Pid}) ->
                        ets:insert(analyze, {{0, J}, ets:lookup_element(rules, X, 2)}),
                        (NewPid = spawn(fun() -> whatis(1, J, Len, Pid, Me) end)) ! {done, 0},
                        {J+1, NewPid}
                end,
                {1, none}, WordList),
    receive
        M -> M
    end.

reset_ets(Name, undefined) -> ets:new(Name, [set, named_table, public]);
reset_ets(Name, _) -> ets:delete_all_objects(Name).

whatis(Len, 1, Len, _, PidRet) -> PidRet ! ets:lookup_element(analyze, {Len-1, 1}, 2); % finished
whatis(I, J, Len, _, _) when I + J == Len + 1 -> ok; % ends useless processes
whatis(I, J, Len, Pid, PidRet) ->
    receive {done, V} when V == I-1 -> ok end,
    Cases = lists:map(fun({X, Y}) -> [{A, B} || A <- ets:lookup_element(analyze, X, 2),
                                                B <- ets:lookup_element(analyze, Y, 2)] end,
                      [{{X-1, J}, {I-X, J+X}} || X <- lists:seq(1, I)]),
    Val = lists:foldl(fun(X, Acc) -> case ets:lookup(rules, X) of
                                         [] -> Acc;
                                         [{_, [R]}] -> [R|Acc]
                                     end
                      end,
                      [], lists:flatten(Cases)),
    ets:insert(analyze, {{I, J}, Val}),
    send(Pid, I),
    whatis(I+1, J, Len, Pid, PidRet).

send(none, _) -> ok;
send(Pid, I) -> Pid ! {done, I}.
%%------------------------------------------
%%
%% Test
%% using the same example as
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------
test_analyze(S) ->
    import_grammar_file("grammar.txt"),
    analyze(S).

test_analyze() ->
    test_analyze("she eats a fish with a fork").

Related

How to speed up "Queue using Two Stacks" in Erlang [duplicate]

A common programming problem which I've done in Python, Java, etc. with no problem. The following in Erlang (which I'm just learning) runs very slowly (~44s user time for 10^5 operations), and I am lost as to why.
As written on HackerRank, the program takes one line from stdin with an integer representing the number of operations that follow. Each subsequent line should be 1 X (enqueue X), 2 (dequeue and discard), or 3 (peek and print the next value on the queue).
Am I using lists:reverse/1 incorrectly?
-module(two_stacks).

%% API exports
-export([main/1]).

enqueue(Num, F, B) ->
    {[Num | F], B}.

dequeue(F, []) ->
    [_|B] = lists:reverse(F),
    {[], B};
dequeue(F, [_|B]) ->
    {F, B}.

peek(F, []) ->
    [H|T] = lists:reverse(F),
    io:format(H),
    {[], [H|T]};
peek(F, [H|T]) ->
    io:format(H),
    {F, [H|T]}.

dispatchOperation(_, {F, B}) ->
    [Code|Line] = io:get_line(""),
    case Code of
        49 ->
            [_|Num] = Line,
            enqueue(Num, F, B);
        50 -> dequeue(F, B);
        51 -> peek(F, B)
    end.

main(_) ->
    {Count, _} = string:to_integer(io:get_line("")),
    _ = lists:foldl(fun dispatchOperation/2, {[], []}, lists:seq(1, Count)),
    erlang:halt(0).
https://www.hackerrank.com/challenges/queue-using-two-stacks/problem
Are you running an escript? If that's the case, you should add a -mode(compile). there, because otherwise it runs the script in interpreted mode.
Also, you can compare the times against using the queue module (which is implemented using two stacks)
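For comparison, here is a rough sketch of the same dispatch loop on top of OTP's queue module (my own code, not from the thread; it borrows the binary line parsing used in the fix below):
%% Rough sketch: the same three operations expressed with the queue module,
%% for timing comparison against the hand-rolled two-stack version.
run_with_queue(_, Q0) ->
    Line = io:get_line(""),
    [X | Rest] = binary:split(Line, [<<$\s>>, <<$\n>>], [global]),
    case binary_to_integer(X) of
        1 -> queue:in(binary_to_integer(hd(Rest)), Q0);    % enqueue X
        2 -> {_, Q1} = queue:out(Q0), Q1;                   % dequeue and discard
        3 -> io:format("~B~n", [queue:get(Q0)]), Q0         % peek and print
    end.

main_with_queue() ->
    ok = io:setopts(standard_io, [binary]),
    {Count, _} = string:to_integer(io:get_line("")),
    _ = lists:foldl(fun run_with_queue/2, queue:new(), lists:seq(1, Count)),
    erlang:halt(0).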
The issue is with the way I'm parsing input. See 46493207 for discussion.
Since all the inputs are integers I was able to make use of the same technique used there. The completed code is:
-module(solution).
-export([main/0]).

enqueue(Num, F, B) ->
    {[Num | F], B}.

dequeue(F, []) ->
    [_|B] = lists:reverse(F),
    {[], B};
dequeue(F, [_|B]) ->
    {F, B}.

peek(F, []) ->
    [H|T] = lists:reverse(F),
    io:format("~B~n", [H]),
    {[], [H|T]};
peek(F, [H|T]) ->
    io:format("~B~n", [H]),
    {F, [H|T]}.

run(_, {F, B}) ->
    Line = io:get_line(""),
    [X | Y] = binary:split(Line, [<<$\s>>, <<$\n>>], [global]),
    Code = binary_to_integer(X),
    case Code of
        1 ->
            Num = binary_to_integer(lists:nth(1, Y)),
            enqueue(Num, F, B);
        2 -> dequeue(F, B);
        3 -> peek(F, B)
    end.

main() ->
    ok = io:setopts(standard_io, [binary]),
    {Count, _} = string:to_integer(io:get_line("")),
    _ = lists:foldl(fun run/2, {[], []}, lists:seq(1, Count)),
    erlang:halt(0).

Erlang repetition string in string

I have a string:
"abc abc abc abc"
How do I calculate the number of "abc" repetitions?
If you are looking for a practical and efficient implementation which will scale well even for longer substrings, you can use binary:matches/2,3, which uses the Boyer–Moore string search algorithm (and Aho–Corasick for multiple substrings). It obviously works only for ASCII or Latin-1 strings.
repeats(L, S) -> length(binary:matches(list_to_binary(L), list_to_binary(S))).
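A quick check (assuming repeats/2 is compiled into a module, here arbitrarily called str):
1> str:repeats("abc abc abc abc", "abc").
4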
If it is for educational purposes, you can write your own less efficient version for lists of any kind. If you know the substring at compile time, you can use a very simple version that does not perform too badly:
-define(SUBSTR, "abc").
repeats(L) -> repeats(L, 0).
repeats(?SUBSTR ++ L, N) -> repeats(L, N+1);
repeats([_|L] , N) -> repeats(L, N);
repeats([] , N) -> N.
If you don't know the substring, you can write a slightly more complicated and less efficient version:
repeats(L, S) -> repeats(L, S, 0).

repeats([], _, N) -> N;
repeats(L, S, N) ->
    case prefix(L, S) of
        {found, L2} -> repeats(L2, S, N+1);
        nope        -> repeats(tl(L), S, N)
    end.

prefix([H|T], [H|S]) -> prefix(T, S);
prefix(L,     [])    -> {found, L};
prefix(_,     _)     -> nope.
And you can, of course, try to write a more sophisticated variant, such as a simplified Boyer–Moore for lists.
1> F = fun
           F([],_,_,N) -> N;
           F(L,P,S,N) ->
               case string:sub_string(L,1,S) == P of
                   true -> F(tl(string:sub_string(L,S,length(L))),P,S,N+1);
                   _ -> F(tl(L),P,S,N)
               end
       end.
#Fun<erl_eval.28.106461118>
2> Find = fun(L,P) -> F(L,P,length(P),0) end.
#Fun<erl_eval.12.106461118>
3> Find("abc abc abc abc","abc").
4
4>
This works if defined in a module, or in the shell, but only from R17 onwards.
length(lists:filter(fun(X) -> X=="abc" end, string:tokens("abc abc abc abc", " "))).

How can I read the return value of add(N) function in my calc function?

I am new to Erlang. I need to spawn two processes running the add function, then add the two numbers.
The assigned values of process one and two show the process id; I need to catch the value.
How can I read the return value of the add(N) function in my calc function?
-module(myerl).
-export([calc/1, add/1]).

add(N) ->
    N + 5.

calc(L) ->
    pone = spawn( fun() -> add(A) end),
    ptwo = spawn( fun() -> add(B) end),
    Result = Pone + Ptwo,
    io:format("result ~p~n", [Result]).
You need to use message passing. You must send a message back to the calling process with the result. The spawn function returns a PID (process identifier) to the newly spawned process, not the result of its execution.
This example should do what you're expecting:
calc(A, B) ->
    Self = self(), % The spawned funs need a Pid to send to, use a closure
    POne = spawn(fun() -> Self ! {self(), add(A)} end),
    PTwo = spawn(fun() -> Self ! {self(), add(B)} end),
    wait_for_response(POne, PTwo, 0).

wait_for_response(undefined, undefined, Sum) ->
    Sum;
wait_for_response(POne, PTwo, Sum) ->
    receive
        {POne, V} -> wait_for_response(undefined, PTwo, Sum + V);
        {PTwo, V} -> wait_for_response(POne, undefined, Sum + V)
    end.
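For example, assuming add/1 is the one from the question (N + 5) and calc/2 is exported from myerl:
1> myerl:calc(2, 3).
15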
@Soup d'Campbells' explanation is good. I instinctively did something slightly different which, in a toy way, anticipates some bad behavior from the child processes. Also, I allow the input to be a list of numbers, not just 2.
-module(myerl).
-export([calc/1, add/1]).

calc(NumList) when is_list(NumList) ->
    Parent = self(),
    _Pids = [spawn(fun() -> Parent ! add(ANum) end) || ANum <- NumList],
    collect(length(NumList), 0);
calc(_) ->
    {error, badarg}.

collect(0, Sum) ->
    Sum;
collect(Cnt, Sum) ->
    receive
        N when is_number(N) ->
            collect(Cnt-1, Sum + N);
        _Bad -> % returned something that isn't a number
            collect(Cnt-1, Sum)
    after 1000 -> % died or is too slow
        collect(Cnt-1, Sum)
    end.

add(N) ->
    N + 5.
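For example (each spawned process adds 5, so [1,2,3] gives 6 + 7 + 8):
1> myerl:calc([1, 2, 3]).
21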

Splitting a list in equal sized chunks in Erlang

I want to split:
[1,2,3,4,5,6,7,8]
into:
[[1,2],[3,4],[5,6],[7,8]]
It generally works great with:
[ lists:sublist(List, X, 2) || X <- lists:seq(1,length(List),2) ] .
But it is really slow this way: 10000 elements take an amazing 2.5 seconds on my netbook. I have also written a really fast recursive function, but I am simply interested: could this list comprehension also be written in a different way, so that it is faster?
Try this:
part(List) ->
    part(List, []).

part([], Acc) ->
    lists:reverse(Acc);
part([H], Acc) ->
    lists:reverse([[H]|Acc]);
part([H1,H2|T], Acc) ->
    part(T, [[H1,H2]|Acc]).
Test in the Erlang shell (I've declared this function in module part):
2> part:part([1,2,3,4,5,6,7,8]).
[[1,2],[3,4],[5,6],[7,8]]
3>
3> timer:tc(part, part, [lists:seq(1,10000)]).
{774,
[[1,2],
[3,4],
[5,6],
[7,8],
"\t\n","\v\f",
[13,14],
[15,16],
[17,18],
[19,20],
[21,22],
[23,24],
[25,26],
[27,28],
[29,30],
[31,32],
"!\"","#$","%&","'(",")*","+,","-.","/0","12","34",
[...]|...]}
Just 774 microseconds (which is ~0.8 milliseconds).
Here are two quick solutions for you that are both flexible. One is easy to read, but only slightly faster than your proposed solution. The other is quite fast, but is a bit cryptic to read. And note that both of my proposed algorithms will work for lists of anything, not just numeric ordered lists.
Here is the "easy-to-read" one. Call by n_length_chunks(List,Chunksize). For example, to get a list of chunks 2 long, call n_length_chunks(List,2). This works for chunks of any size, ie, you could call n_length_chunks(List,4) to get [[1,2,3,4],[5,6,7,8],...]
n_length_chunks([],_) -> [];
n_length_chunks(List,Len) when Len > length(List) ->
    [List];
n_length_chunks(List,Len) ->
    {Head,Tail} = lists:split(Len,List),
    [Head | n_length_chunks(Tail,Len)].
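For instance, assuming the function is compiled into a module (here arbitrarily called chunks), a chunk size that does not divide the list evenly just leaves a shorter last chunk:
1> chunks:n_length_chunks([1,2,3,4,5,6,7,8], 3).
[[1,2,3],[4,5,6],[7,8]]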
The much faster one is here, but is definitely harder to read, and is called in the same way: n_length_chunks_fast(List,2). (I've made one change to this compared with the one above, in that it pads the end of the list with undefined if the length of the list isn't cleanly divisible by the desired chunk length.)
n_length_chunks_fast(List,Len) ->
    LeaderLength = case length(List) rem Len of
        0 -> 0;
        N -> Len - N
    end,
    Leader = lists:duplicate(LeaderLength,undefined),
    n_length_chunks_fast(Leader ++ lists:reverse(List),[],0,Len).

n_length_chunks_fast([],Acc,_,_) -> Acc;
n_length_chunks_fast([H|T],Acc,Pos,Max) when Pos==Max ->
    n_length_chunks_fast(T,[[H] | Acc],1,Max);
n_length_chunks_fast([H|T],[HAcc | TAcc],Pos,Max) ->
    n_length_chunks_fast(T,[[H | HAcc] | TAcc],Pos+1,Max);
n_length_chunks_fast([H|T],[],Pos,Max) ->
    n_length_chunks_fast(T,[[H]],Pos+1,Max).
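With the same hypothetical chunks module, the padding mentioned above shows up like this (if I trace the code correctly):
1> chunks:n_length_chunks_fast([1,2,3,4,5], 2).
[[1,2],[3,4],[5,undefined]]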
Tested on my (really old) laptop:
Your proposed solution took about 3 seconds.
My slow-but-readable one was slightly faster and takes about 1.5 seconds (still quite slow)
My fast version takes about 5 milliseconds.
For completeness, Isac's solution took about 180 milliseconds on my same machine.
Edit: wow, I need to read the complete question first. Oh well, I'll keep this here for posterity in case it helps. As far as I can tell, there's not a good way to do this using list comprehensions. Your original version is slow because each iteration of sublist needs to traverse the list again to get to each successive X, resulting in complexity just under O(N^2).
Or with a fold:
lists:foldr(fun(E, []) -> [[E]];
               (E, [H|RAcc]) when length(H) < 2 -> [[E|H]|RAcc];
               (E, [H|RAcc]) -> [[E],H|RAcc]
            end, [], List).
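Since foldr folds from the right, the order of the elements is preserved, e.g.:
1> lists:foldr(fun(E, []) -> [[E]];
                  (E, [H|RAcc]) when length(H) < 2 -> [[E|H]|RAcc];
                  (E, [H|RAcc]) -> [[E],H|RAcc]
               end, [], [1,2,3,4,5,6,7,8]).
[[1,2],[3,4],[5,6],[7,8]]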
I want to submit a slightly more complicated but more flexible (and mostly faster) variation of the solution proposed by @Tilman:
split_list(List, Max) ->
    element(1, lists:foldl(fun
            (E, {[Buff|Acc], C}) when C < Max ->
                {[[E|Buff]|Acc], C+1};
            (E, {[Buff|Acc], _}) ->
                {[[E],Buff|Acc], 1};
            (E, {[], _}) ->
                {[[E]], 1}
        end, {[], 0}, List)).
so the part function can be implemented as
part(List) ->
    RevList = split_list(List, 2),
    lists:foldl(fun(E, Acc) ->
            [lists:reverse(E)|Acc]
        end, [], RevList).
update
I've added the reverse in case you want to preserve order; as far as I can see it adds no more than 20% to the processing time.
You could do it like this:
1> {List1, List2} = lists:partition(fun(X) -> (X rem 2) == 1 end, List).
{[1,3,5|...],[2,4,6|...]}
2> lists:zipwith(fun(X, Y) -> [X, Y] end, List1, List2).
[[1,2],[3,4],[5,6]|...]
This takes ~73 milliseconds with a 10000-element list on my computer. The original solution takes ~900 milliseconds.
But I would go with the recursive function anyway.
I was looking for a partition function which can split a large list among a small number of workers. With lkuty's partition you might end up with one worker getting almost double the work of all the others. If that's not what you want, here is a version where the sublist lengths differ by at most 1.
Uses PropEr for testing.
%% @doc Split List into sub-lists so that the sub-list lengths differ by at most 1.
%% Does not preserve order.
-spec split_many(pos_integer(), [T]) -> [[T]] when T :: term().
split_many(N, List) ->
    PieceLen = length(List) div N,
    lists:reverse(split_many(PieceLen, N, List, [])).

-spec split_many(pos_integer(), pos_integer(), [T], [[T]]) ->
    [[T]] when T :: term().
split_many(PieceLen, N, List, Acc) when length(Acc) < N ->
    {Head, Tail} = lists:split(PieceLen, List),
    split_many(PieceLen, N, Tail, [Head|Acc]);
split_many(_PieceLen, _N, List, Acc) ->
    % Add an Elem to each list in Acc
    {Appendable, LeaveAlone} = lists:split(length(List), Acc),
    Appended = [[Elem|XS] || {Elem, XS} <- lists:zip(List, Appendable)],
    lists:append(Appended, LeaveAlone).
Tests:
split_many_test_() ->
    [
     ?_assertEqual([[1,2]], elibs_lists:split_many(1, [1,2])),
     ?_assertEqual([[1], [2]], elibs_lists:split_many(2, [1,2])),
     ?_assertEqual([[1], [3,2]], elibs_lists:split_many(2, [1,2,3])),
     ?_assertEqual([[1], [2], [4,3]], elibs_lists:split_many(3, [1,2,3,4])),
     ?_assertEqual([[1,2], [5,3,4]], elibs_lists:split_many(2, [1,2,3,4,5])),
     ?_assert(proper:quickcheck(split_many_proper1())),
     ?_assert(proper:quickcheck(split_many_proper2()))
    ].
%% @doc Verify all elements are preserved, the number of groups is correct,
%% and all groups have the same number of elements (+-1)
split_many_proper1() ->
    ?FORALL({List, Groups},
            {list(), pos_integer()},
            begin
                Split = elibs_lists:split_many(Groups, List),
                % Lengths of sub-lists
                Lengths = lists:usort(lists:map(fun erlang:length/1, Split)),
                length(Split) =:= Groups andalso
                lists:sort(lists:append(Split)) == lists:sort(List) andalso
                length(Lengths) =< 2 andalso
                case Lengths of
                    [Min, Max] -> Max == Min + 1;
                    [_] -> true
                end
            end
           ).

%% @doc If the number of elements is divisible by the number of groups,
%% the ordering must stay the same
split_many_proper2() ->
    ?FORALL({Groups, List},
            ?LET({A, B},
                 {integer(1, 20), integer(1, 10)},
                 {A, vector(A*B, term())}),
            List =:= lists:append(elibs_lists:split_many(Groups, List))
           ).
Here is a more general answer that works with any sublist size.
1> lists:foreach(fun(N) -> io:format("~2.10.0B -> ~w~n",[N, test:partition([1,2,3,4,5,6,7,8,9,10],N)] ) end, [1,2,3,4,5,6,7,8,9,10]).
01 -> [[1],[2],[3],[4],[5],[6],[7],[8],[9],[10]]
02 -> [[1,2],[3,4],[5,6],[7,8],[9,10]]
03 -> [[1,2,3],[4,5,6],[7,8,9],[10]]
04 -> [[1,2,3,4],[5,6,7,8],[10,9]]
05 -> [[1,2,3,4,5],[6,7,8,9,10]]
06 -> [[1,2,3,4,5,6],[10,9,8,7]]
07 -> [[1,2,3,4,5,6,7],[10,9,8]]
08 -> [[1,2,3,4,5,6,7,8],[10,9]]
09 -> [[1,2,3,4,5,6,7,8,9],[10]]
10 -> [[1,2,3,4,5,6,7,8,9,10]]
And the code to achieve this is stored inside a file called test.erl:
-module(test).
-compile(export_all).

partition(List, N) ->
    partition(List, 1, N, []).

partition([], _C, _N, Acc) ->
    lists:reverse(Acc);
partition([H|T], 1, N, Acc) ->
    partition(T, 2, N, [[H]|Acc]);
partition([H|T], C, N, [HAcc|TAcc]) when C < N ->
    partition(T, C+1, N, [[H|HAcc]|TAcc]);
partition([H|T], C, N, [HAcc|TAcc]) when C == N ->
    partition(T, 1, N, [lists:reverse([H|HAcc])|TAcc]);
partition(L, C, N, Acc) when C > N ->
    partition(L, 1, N, Acc).
It could probably be more elegant regarding the special case where C > N. Note that C is the size of the current sublist being constructed. At start, it is 1. And then it increments until it reaches the partition size of N.
We could also use a modified version of @chops' code to let the last list contain the remaining items even if its size < N:
-module(n_length_chunks_fast).
-export([n_length_chunks_fast/2]).

n_length_chunks_fast(List,Len) ->
    SkipLength = case length(List) rem Len of
        0 -> 0;
        N -> Len - N
    end,
    n_length_chunks_fast(lists:reverse(List),[],SkipLength,Len).

n_length_chunks_fast([],Acc,_Pos,_Max) -> Acc;
n_length_chunks_fast([H|T],Acc,Pos,Max) when Pos==Max ->
    n_length_chunks_fast(T,[[H] | Acc],1,Max);
n_length_chunks_fast([H|T],[HAcc | TAcc],Pos,Max) ->
    n_length_chunks_fast(T,[[H | HAcc] | TAcc],Pos+1,Max);
n_length_chunks_fast([H|T],[],Pos,Max) ->
    n_length_chunks_fast(T,[[H]],Pos+1,Max).
I've slightly altered the implementation from @JLarky to remove the guard expression, which should be slightly faster:
split_list(List, Max) ->
    element(1, lists:foldl(fun
            (E, {[Buff|Acc], 1}) ->
                {[[E],Buff|Acc], Max};
            (E, {[Buff|Acc], C}) ->
                {[[E|Buff]|Acc], C-1};
            (E, {[], _}) ->
                {[[E]], Max}
        end, {[], Max}, List)).

The most efficient way to read a file into a list of strings

What is the most efficient way, in terms of time consumed, to read a text file into a list of binary strings in Erlang? The obvious solution
-module(test).
-export([run/1]).

open_file(FileName, Mode) ->
    {ok, Device} = file:open(FileName, [Mode, binary]),
    Device.

close_file(Device) ->
    ok = file:close(Device).

read_lines(Device, L) ->
    case io:get_line(Device, L) of
        eof ->
            lists:reverse(L);
        String ->
            read_lines(Device, [String | L])
    end.

run(InputFileName) ->
    Device = open_file(InputFileName, read),
    Data = read_lines(Device, []),
    close_file(Device),
    io:format("Read ~p lines~n", [length(Data)]).
becomes too slow when the file contains more than 100000 lines.
{ok, Bin} = file:read_file(Filename).
or, if you need the contents line by line:
read(File) ->
    case file:read_line(File) of
        {ok, Data} -> [Data | read(File)];
        eof -> []
    end.
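Since the question asks for a list of binary strings, a minimal sketch combining the first approach with binary:split/3 (my own addition, not from the answer):
%% Read the whole file at once, then split the binary into lines.
%% Note: if the file ends with a newline, the last element is an empty binary.
read_file_lines(Filename) ->
    {ok, Bin} = file:read_file(Filename),
    binary:split(Bin, <<"\n">>, [global]).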
Read the entire file into a binary. Convert it to a list and rip out the lines.
This is far more efficient than any other method. If you don't believe me, time it.
file2lines(File) ->
    {ok, Bin} = file:read_file(File),
    string2lines(binary_to_list(Bin), []).

string2lines("\n" ++ Str, Acc) -> [lists:reverse([$\n|Acc]) | string2lines(Str, [])];
string2lines([H|T], Acc)       -> string2lines(T, [H|Acc]);
string2lines([], Acc)          -> [lists:reverse(Acc)].
