Cannot properly encode a sample text using huffman - erlang

The problem occurs in the find function where the erlang shell tells me that an exception error has occurred, it says:
Exception error: no function clause matching seminar1:find("t", []) (seminar1.erl, line 117) in function seminar1:encode/3 ( seminar1.erl, line 113).
I believe what is happening is that the pattern-matching done in the first find function is always failing, though I do not understand why since attempts to do comparisons manually have been successful.
-module(seminar1).
-compile(export_all).
sample() -> "the quick brown fox jumps over the lazy dog
this is a sample text that we will use when we build
up a table we will only handle lower case letters and
no punctuation symbols the frequency will of course not
represent english but it is probably not that far off".
text() -> "this is something that we should encode".
test() ->
Sample = sample(),
Tree = tree(Sample),
Encode = encode_table(Tree),
Decode = decode_table(Tree),
Text = text(),
Seq = encode(Text, Encode),
Text = decode(Seq, Decode).
tree(Sample) -> Freq = freq(Sample),
F = fun({node,N1,V1,_,_}, {node,N2,V2,_,_}) ->
if
V1 > V2 -> false;
V1 == V2 -> if
N1 > N2 -> false;
true -> true
end;
true -> true
end
end,
%lists:sort(F,Freq).
huffman(lists:sort(F,Freq)).
% Calculate the frequency of each letter in the Sample and return a
datastructure of nodes containing the letter involved,
% frequency of it in the sample.
% datastructure {node, Key, Value, Left, Right}
freq(Sample) -> freq(Sample, []).
freq([], Freq) -> Freq;
freq([Char|Rest], Freq) -> freq(Rest, check(Char, Freq)).
% Check function complements the Freq function, it takes the current input
and pattern matches it with the frequency datastructue being built.
% If it scores a hit that particular node has its frequency incremented and
then the whole datastructure is returned.
check(Key, []) -> [{node, [Key], 1, nil, nil}];
check(Key, [{node, [Key], Value, nil, nil}| Tail]) -> [{node, [Key], Value +
1, nil, nil}| Tail];
check(Key, [H|T]) -> [H |check(Key, T)].
% Creates the Huffman tree that is later used to encode a sample.
% The input is the SORTED datastructure derived from the freq-function.
% The leaves of the huffman tree are where actual values reside, branches
are just nodes containing information.
huffman( [ Tree | [] ] ) -> Tree;
huffman([{node, LeftKey, LeftValue, _L1, _R1},
{node, RightKey, RightValue, _L2, _R2} |Tail]) ->
% Creating a branch node
BranchNode = {node, LeftKey ++ RightKey, LeftValue + RightValue, {node,
LeftKey, LeftValue, _L1, _R1}, {node, RightKey, RightValue, _L2, _R2}},
huffman(insert(BranchNode, Tail)).
% A complementary function to the huffman function, inserts the newly made
branchnode into the already sorted tail.
% This is to prevent the sorted tail from becoming unsorted when turning the
tail list into a tree.
% It is inserted as such that the branchnode is the first selection of its
current value,
% meaning that if you have 4 nodes of value 5 ( one being a branchnode) then
the branch node will be the first option.
% This will make the Tree structure left leaning.
%
% N
% N N
% N N
% N N N N
insert(Node, []) -> [Node|[]];
insert(Node, [H|T]) ->
{_, _, Nvalue, _, _} = Node,
{_, _, Hdvalue, _, _} = H,
if
Nvalue =< Hdvalue -> [ Node | [H|T]];
true -> [H | insert(Node, T)]
end.
% Takes the tree created by the huffman-function as input and traverses said
tree.
% Returns a list containing the letters found and their position in the
tree, Left = 0, Right = 1.
% {"e"/[101], [0,0,0]} -- {[Key], [pathway]}
% Traversal method used: Left based traversal.
encode_table(RootNode) -> encode_table(RootNode, [], []).
% When traversing the Tree I need to know the branchnode I am in, the result
list as I am adding letters to it and a PathwayList which is the current
binary path to the branchnode I am in.
encode_table({_, Key, _, nil, nil}, AccList, PathwayList) ->
[AccList | [{Key, reverse(PathwayList)}]];
encode_table({_, _, _, Left, Right}, AccList, PathwayList) ->
encode_table(
Right,
encode_table(Left, AccList, [0| PathwayList]),
[1|PathwayList]).
% Complementary function for the encode_table/3 function, when traversing
the tree the the pathway gets reversed so it needs to be corrected.
reverse(L) -> reverse(L, []).
reverse([], Rev) -> Rev;
reverse([H|T], Rev) -> reverse(T, [H|Rev]).
% Takes a sample text and encodes it in accordance to the encoding table
supplied
encode(Text, Table) -> encode(Text, Table, []).
encode([], _, EncodedText) -> EncodedText;
encode([Letter|Rest], Table, EncodedText) ->
encode(Rest, Table, [find([Letter], Table) | EncodedText]).
% Complementary function to encode/3, searches the Table for the related
Letters binary path.
%find(Letter, []) -> Letter;
find(Letter, [{Letter, BinaryPath} | _Rest]) ->
BinaryPath;
find(Letter, [ _ | Rest]) ->
find(Letter, Rest).
decode_table(tree) -> ok.
decode(sequence, table) -> ok.
test(Letter, [{Letter, Asd} | []]) ->
true;
test(_, _) -> false.

I have tried to follow your code , but I am stuck on the function decode_table(tree) -> ok.. With this spelling, it fails (tree is an atom and won't match anything but tree itself). Change to _Tree to ignore the issue when I understood that the decode functions are not written yet or not provided.
For the encoding, the trouble is that the function encode_table returns a nested list, not suitable with the find function. If you replace the code by encode_table(RootNode) -> lists:flatten(encode_table(RootNode, [], [])). then it works (at least it seems to work since I don't know which result you are expecting)
-module(seminar1).
-compile(export_all).
sample() -> "the quick brown fox jumps over the lazy dog
this is a sample text that we will use when we build
up a table we will only handle lower case letters and
no punctuation symbols the frequency will of course not
represent english but it is probably not that far off".
text() -> "this is something that we should encode".
test() ->
Sample = sample(),
Tree = tree(Sample),
Encode = encode_table(Tree),
%Decode = decode_table(Tree),
Text = text(),
Seq = encode(Text, Encode),
%Text = decode(Seq, Decode).
Seq.
tree(Sample) -> Freq = freq(Sample),
F = fun({node,N1,V1,_,_}, {node,N2,V2,_,_}) ->
if
V1 > V2 -> false;
V1 == V2 -> if
N1 > N2 -> false;
true -> true
end;
true -> true
end
end,
%lists:sort(F,Freq).
huffman(lists:sort(F,Freq)).
% Calculate the frequency of each letter in the Sample and return a
% datastructure of nodes containing the letter involved,
% frequency of it in the sample.
% datastructure {node, Key, Value, Left, Right}
freq(Sample) -> freq(Sample, []).
freq([], Freq) -> Freq;
freq([Char|Rest], Freq) -> freq(Rest, check(Char, Freq)).
% Check function complements the Freq function, it takes the current input
% and pattern matches it with the frequency datastructue being built.
% If it scores a hit that particular node has its frequency incremented and
% then the whole datastructure is returned.
check(Key, []) ->
[{node, [Key], 1, nil, nil}];
check(Key, [{node, [Key], Value, nil, nil}| Tail]) ->
[{node, [Key], Value + 1, nil, nil}| Tail];
check(Key, [H|T]) ->
[H |check(Key, T)].
% Creates the Huffman tree that is later used to encode a sample.
% The input is the SORTED datastructure derived from the freq-function.
% The leaves of the huffman tree are where actual values reside, branches
% are just nodes containing information.
huffman( [ Tree | [] ] ) -> Tree;
huffman([{node, LeftKey, LeftValue, _L1, _R1},
{node, RightKey, RightValue, _L2, _R2} |Tail]) ->
% Creating a branch node
BranchNode = {node, LeftKey ++ RightKey, LeftValue + RightValue, {node, LeftKey, LeftValue, _L1, _R1}, {node, RightKey, RightValue, _L2, _R2}},
huffman(insert(BranchNode, Tail)).
% A complementary function to the huffman function, inserts the newly made
% branchnode into the already sorted tail.
% This is to prevent the sorted tail from becoming unsorted when turning the
% tail list into a tree.
% It is inserted as such that the branchnode is the first selection of its
% current value,
% meaning that if you have 4 nodes of value 5 ( one being a branchnode) then
% the branch node will be the first option.
% This will make the Tree structure left leaning.
%
% N
% N N
% N N
% N N N N
insert(Node, []) -> [Node|[]];
insert(Node, [H|T]) ->
{_, _, Nvalue, _, _} = Node,
{_, _, Hdvalue, _, _} = H,
if
Nvalue =< Hdvalue -> [ Node | [H|T]];
true -> [H | insert(Node, T)]
end.
% Takes the tree created by the huffman-function as input and traverses said tree.
% Returns a list containing the letters found and their position in the
% tree, Left = 0, Right = 1.
% {"e"/[101], [0,0,0]} -- {[Key], [pathway]}
% Traversal method used: Left based traversal.
encode_table(RootNode) -> lists:flatten(encode_table(RootNode, [], [])).
% When traversing the Tree I need to know the branchnode I am in, the result
% list as I am adding letters to it and a PathwayList which is the current
% binary path to the branchnode I am in.
encode_table({_, Key, _, nil, nil}, AccList, PathwayList) ->
[AccList | [{Key, reverse(PathwayList)}]];
encode_table({_, _, _, Left, Right}, AccList, PathwayList) ->
encode_table(
Right,
encode_table(Left, AccList, [0| PathwayList]),
[1|PathwayList]).
% Complementary function for the encode_table/3 function, when traversing
% the tree the the pathway gets reversed so it needs to be corrected.
reverse(L) -> reverse(L, []).
reverse([], Rev) -> Rev;
reverse([H|T], Rev) -> reverse(T, [H|Rev]).
% Takes a sample text and encodes it in accordance to the encoding table supplied
encode(Text, Table) -> encode(Text, Table, []).
encode([], _, EncodedText) -> EncodedText;
encode([Letter|Rest], Table, EncodedText) ->
encode(Rest, Table, [find([Letter], Table) | EncodedText]).
% Complementary function to encode/3, searches the Table for the related
% Letters binary path.
find(Letter, []) -> Letter;
find(Letter, [{Letter, BinaryPath} | _Rest]) ->
BinaryPath;
find(Letter, [ _ | Rest]) ->
find(Letter, Rest).
decode_table(_Tree) -> ok.
decode(sequence, table) -> ok.
test(Letter, [{Letter, _Asd} | []]) ->
true;
test(_, _) -> false.
gives the result:
64> c(seminar1).
{ok,seminar1}
65> rp(seminar1:test()).
[[0,0,0],
[1,0,0,0,1,0],
[0,1,1,1],
[1,0,1,1,0,0],
[0,1,0,0],
[0,0,0],
[1,1,1],
[1,0,0,0,1,0],
[1,0,0,1],
[1,1,0,1,0],
[0,1,1,1],
[1,0,1,0,0],
[0,1,0,1],
[1,1,1],
[0,0,0],
[1,0,1,0,1],
[1,1,1],
[1,1,0,0],
[0,0,1,1],
[1,0,1,0,0],
[1,1,0,0],
[1,1,1],
[1,0,0,0,0,0,0],
[0,1,0,0],
[1,1,0,1,1],
[1,0,1,0,0],
[1,1,0,0],
[0,0,0],
[1,0,0,0,1,1,1],
[0,1,1,1],
[0,1,0,1],
[1,1,1],
[0,1,0,1],
[1,1,0,1,1],
[1,1,1],
[0,1,0,1],
[1,1,0,1,1],
[1,0,1,0,0],
[1,1,0,0]]
ok
66>
Edit
You will get the same result if you replace
encode_table({_, Key, _, nil, nil}, AccList, PathwayList) ->
[AccList | [{Key, reverse(PathwayList)}]];
which is responsible for the nested result by this version which directly produces a flat list.
encode_table({_, Key, _, nil, nil}, AccList, PathwayList) ->
[{Key, reverse(PathwayList)} | AccList];
It is the general way to build a list: [Head|Tail] where Head is any erlang term and Tail is a list. Your code produces a result like [[[],{Key1,Path1}],{Key2,Path2}] while my version gives [{Key2,Path2},{Key1,Path1}]
Some remarks :
In erlang, the usage of if is not frequent, in my opinion mainly because of the last true -> DoSomething() clause which is in most cases very inexpressive.
Another point, searching in a list is not very fast, it is not an issue for isolate search, but in your case the encode and decode functions are doing it for each character, in my opinion, a map is more appropriate to store the Encode and Decode tables than a key/value list.

Related

List of tuples [{id, [<List>]}, {id2, [<List>]} ] where ids are the second item of the tuple of the original list- Erlang

The title^ is kinda confusing but I will illustrate what I want to achieve:
I have:
[{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077790705827">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538078530667847">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077778390908">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5bad45b1e990057961313822">>,<<"1538082492283531">>
}]
I want to convert it to a list like this:
[
{<<"5b3f77502dfe0deeb8912b42">>,
[{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077790705827">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538078530667847">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077778390908">>}
]},
{<<"5bad45b1e990057961313822">>,
[{<<"5b71d7e458c37fa04a7ce768">>,<<"5bad45b1e990057961313822">>,<<"1538082492283531">>}
]}
]
List of tuples [{id, [<List>]}, {id2, [<List>]} ] where ids are the second item of the tuple of the original list
Example :
<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077790705827">>
Erlang newbie here. I created a dict with the second members of the tuples as keys and lists of corresponding tuples as values, then used dict:fold to transform it into the expected output format.
-export([test/0, transform/1]).
transform([H|T]) ->
transform([H|T], dict:new()).
transform([], D) ->
lists:reverse(
dict:fold(fun (Key, Tuples, Acc) ->
lists:append(Acc,[{Key,Tuples}])
end,
[],
D));
transform([Tuple={_S1,S2,_S3}|T], D) ->
transform(T, dict:append_list(S2, [Tuple], D)).
test() ->
Input=[{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077790705827">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538078530667847">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077778390908">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5bad45b1e990057961313822">>,<<"1538082492283531">>}
],
Output=transform(Input),
case Output of
[
{<<"5b3f77502dfe0deeb8912b42">>,
[{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077790705827">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538078530667847">>},
{<<"5b71d7e458c37fa04a7ce768">>,<<"5b3f77502dfe0deeb8912b42">>,<<"1538077778390908">>}
]},
{<<"5bad45b1e990057961313822">>,
[{<<"5b71d7e458c37fa04a7ce768">>,<<"5bad45b1e990057961313822">>,<<"1538082492283531">>}
]}
] -> ok;
_Else -> error
end.
I think I see what you're after... Please correct me if I'm wrong.
There are a number of ways to do this, it really just depends on what sort of data structure you're interested in using to check the presence of like-keys. I'll show you two fundamentally different ways to do this and a third hybrid method that has become recently available:
Indexed data types (in this case a map)
List operations with matching
Hybrid matching over map keys
Since you're new I'll use the first case to demonstrate two ways of writing it: explicit recursion and using an actual list function from the lists module.
Indexy Data Types
The first way we'll do this is to use a hash table (aka "dict", "map", "hash", "K/V", etc.) and explicitly recurse through the elements, checking for the presence of the key encountered and adding it if it is missing, or appending to the list of values it points to if it does. We'll use an Erlang map for this. At the end of the function we'll convert the utility map back to a list:
explicit_convert(List) ->
Map = explicit_convert(List, maps:new()),
maps:to_list(Map).
explicit_convert([H | T], A) ->
K = element(2, H),
NewA =
case maps:is_key(K, A) of
true ->
V = maps:get(K, A),
maps:put(K, [H | V], A);
false ->
maps:put(K, [H], A)
end,
explicit_convert(T, NewA);
explicit_convert([], A) ->
A.
There is nothing wrong with explicit recursion (it is particularly good if you're new, because every part of it is left in the open to be examined), but this is a "left fold" and we already have a library function that abstracts a little bit of the plumbing out. So we really only need to write a function that checks for the presence of an element, and adds the key or appends the value:
fun_convert(List) ->
Map = lists:foldl(fun convert/2, maps:new(), List),
maps:to_list(Map).
convert(H, A) ->
K = element(2, H),
case maps:is_key(K, A) of
true ->
V = maps:get(K, A),
maps:put(K, [H | V], A);
false ->
maps:put(K, [H], A)
end.
Listy Conversion
The other major way we could have done this is with listy matching. To do that you need to first guarantee that your elements are sorted on the element you want to use as a key so that you can use it as a sort of "working element" and match on it. The code should be pretty easy to understand once you stare at it for a bit (maybe write out how it will step through your list by hand on paper once if you're totally perplexed):
listy_convert(List) ->
[T = {_, K, _} | Rest] = lists:keysort(2, List),
listy_convert(Rest, {K, [T]}, []).
listy_convert([T = {_, K, _} | Rest], {K, Ts}, Acc) ->
listy_convert(Rest, {K, [T | Ts]}, Acc);
listy_convert([T = {_, K, _} | Rest], Done, Acc) ->
listy_convert(Rest, {K, [T]}, [Done | Acc]);
listy_convert([], Done, Acc) ->
[Done | Acc].
Note that we split the list immediately after sorting it. The reason is that we have "prime the pump", so to speak, on the first call we make to listy_convert/3. This also means that this function will crash if you pass it an empty list. You can solve that by adding a clause to listy_convert/1 that matches on the empty list [].
A Final Bit of Magic
With those firmly in mind... consider that we also have a bit of a hybrid option available in newer versions of Erlang due to the magical syntax available to maps. We can match (most values) on map keys inside of a case clause (though we can't unify on a key value provided by other arguments within a function head):
map_convert(List) ->
maps:to_list(map_convert(List, #{})).
map_convert([T = {_, K, _} | Rest], Acc) ->
case Acc of
#{K := Ts} -> map_convert(Rest, Acc#{K := [T | Ts]});
_ -> map_convert(Rest, Acc#{K => [T]})
end;
map_convert([], Acc) ->
Acc.
Here is a one-liner that would produce your expected result:
[{K, [E || {_, K2, _} = E <- List, K =:= K2]} || {_, K, _} <- lists:ukeysort(2, List)].
What’s going on here? Let’s do it step by step…
This is your original list
List = […],
lists:ukeysort/2 leaves just one element per key in the list
OnePerKey = lists:ukeysort(2, List),
We then extract the keys with the first list comprehension
Keys = [K || {_, K, _} <- OnePerKey],
With the second list comprehension, we find the elements with the key…
fun Filter(K, List) ->
[E || {_, K2, _} = E <- List, K =:= K2]
end
Keep in mind that we can’t just pattern-match with K in the generator (i.e. [E || {_, K, _} = E <- List]) because generators in LCs introduce new scope for the variables.
Finally, putting all together…
[{K, Filter(K, List)} || K <- Keys]
It really depends on your dataset. For lager data sets using maps is a bit more efficient.
-module(test).
-export([test/3, v1/2, v2/2, v3/2, transform/1, do/2]).
test(N, Keys, Size) ->
List = [{<<"5b71d7e458c37fa04a7ce768">>,rand:uniform(Keys),<<"1538077790705827">>} || I <- lists:seq(1,Size)],
V1 = timer:tc(test, v1, [N, List]),
V2 = timer:tc(test, v2, [N, List]),
V3 = timer:tc(test, v3, [N, List]),
io:format("V1 took: ~p, V2 took: ~p V3 took: ~p ~n", [V1, V2, V3]).
v1(N, List) when N > 0 ->
[{K, [E || {_, K2, _} = E <- List, K =:= K2]} || {_, K, _} <- lists:ukeysort(2, List)],
v1(N-1, List);
v1(_,_) -> ok.
v2(N, List) when N > 0 ->
do(List,maps:new()),
v2(N-1, List);
v2(_,_) -> ok.
v3(N, List) when N > 0 ->
transform(List),
v3(N-1, List);
v3(_,_) -> ok.
do([], R) -> maps:to_list(R);
do([H={_,K,_}|T], R) ->
case maps:get(K,R,null) of
null -> NewR = maps:put(K, [H], R);
V -> NewR = maps:update(K, [H|V], R)
end,
do(T, NewR).
transform([H|T]) ->
transform([H|T], dict:new()).
transform([], D) ->
lists:reverse(
dict:fold(fun (Key, Tuples, Acc) ->
lists:append(Acc,[{Key,Tuples}])
end,
[],
D));
transform([Tuple={_S1,S2,_S3}|T], D) ->
transform(T, dict:append_list(S2, [Tuple], D)).
Running both with 100 unique keys and 100,000 records I get:
> test:test(1,100,100000).
V1 took: {75566,ok}, V2 took: {32087,ok} V3 took: {887362,ok}
ok

Remove list element occur only once

I have a list in erlang containing interger values.
I want to remove values that occur only one time.(Not Duplicates).
Input = [1,3,2,1,2,2]
Output = [1,2,1,2,2]
I am newbie to erlang. I have tried an approach to sorting them first using list:sort() and then removing a member if the member next to it is the same.
I am having trouble trying to iterate the list. It would be great help if you can show me how I can do it.
multiple(L) ->
M = L -- lists:usort(L),
[X || X <- L , lists:member(X,M)].
Use map to count values and then filter values which was not present just once.
-module(test).
-export([remove_unique/1]).
remove_unique(L) ->
Count = lists:foldl(fun count/2, #{}, L),
lists:filter(fun(X) -> maps:get(X, Count) =/= 1 end, L).
count(X, M) ->
maps:put(X, maps:get(X, M, 0) + 1, M).
And test:
1> c(test).
{ok,test}
2> test:remove_unique([1,2,3,3,3,5,5,6,7,7]).
[3,3,3,5,5,7,7]
3> test:remove_unique([1,2,3,3,3,5,5,6,7,8]).
[3,3,3,5,5]
4> test:remove_unique([1,3,2,1,2,2]).
[1,2,1,2,2]
Here's a solution I'd written when first seeing the question when posted, that uses the same logic as #A.Sarid's recursion/pattern matching answer, except that I use a "Last" parameter instead of the count.
-module(only_dupes).
-export([process/1]).
process([]) -> [];
process(L) when is_list(L) ->
[H|T] = lists:sort(L),
lists:sort(process(undefined, H, T, [])).
process(Last, Curr, [], Acc)
when Curr =/= Last ->
Acc;
process(_Last, Curr, [], Acc) ->
[Curr | Acc];
process(Last, Curr, [Next | Rest], Acc)
when Curr =/= Last, Curr =/= Next ->
process(Curr, Next, Rest, Acc);
process(_Last, Curr, [Next | Rest], Acc) ->
process(Curr, Next, Rest, [Curr | Acc]).
One way for iterating a list (that as a result will return a new list) is using recursion and pattern matching.
After you sort your list you want to iterate the list and to check not only that it is different from the next element, but that there was no other equal elements before it. Consider the list [3,3,3,5,5] if you will only check the next element, the last 3 will also be unique and that is incorrect.
Here is a working program, I used a counter to cover the above case as well. See the syntax for using [H|T] for iterating over the list. You may see more cases and read more about it here.
-module(test).
-export([remove_unique/1]).
remove_unique(Input) ->
Sorted = lists:sort(Input),
remove_unique(Sorted, [], 0).
% Base case - checks if element is unique
remove_unique([H|[]],Output,Count) ->
case Count of
0 -> Output;
_Other -> [H|Output]
end;
% Count is 0 - might be unique - check with next element
remove_unique([H1|[H2|T]],Output, 0)->
case (H1 =:= H2) of
true -> remove_unique([H2|T],[H1|Output],1);
false -> remove_unique([H2|T],Output,0)
end;
% Count is > 0 - not unique - proceed adding to list until next value
remove_unique([H1|[H2|T]],Output,Count) ->
case (H1 =:= H2) of
true -> remove_unique([H2|T],[H1|Output],Count+1);
false -> remove_unique([H2|T],[H1|Output],0)
end.
Test
7> test:remove_unique([1,2,3,3,3,5,5,6,7,7]).
[7,7,5,5,3,3,3]
8> test:remove_unique([1,2,3,3,3,5,5,6,7,8]).
[5,5,3,3,3]

How to split a list of strings into given number of lists in erlang

Given a list and an integer, I want to split that list into the specified number of lists (inside a list).
For example:
Input:
[1,2,3,4,5,6,7,8,9], 3
Output:
[[1,2,3],[4,5,6],[7,8,9]]
What is a clean and efficient way to do this?
The solution written by Steve Vinoski calls length/1 in guard for each partition which makes it O(N^2). It simply bothers me because it can be done in O(N) and I am performance freak. It can be done in many ways so just for example there is one:
divide(L, N) when is_integer(N), N > 0 ->
divide(N, 0, L, []).
divide(_, _, [], Acc) ->
[lists:reverse(Acc)];
divide(N, N, L, Acc) ->
[lists:reverse(Acc) | divide(N, 0, L, [])];
divide(N, X, [H|T], Acc) ->
divide(N, X+1, T, [H|Acc]).
or as a modification of Steve's solution
divide(L, N) ->
divide(L, N, []).
divide([], _, Acc) ->
lists:reverse(Acc);
divide(L, N, Acc) ->
try lists:split(N, L) of
{H,T} -> divide(T, N, [H|Acc])
catch
error:badarg ->
lists:reverse([L|Acc])
end.
or even simpler:
divide([], _) -> [];
divide(L, N) ->
try lists:split(N, L) of
{H,T} -> [H|divide(T, N)]
catch
error:badarg -> [L]
end.
You can use lists:split/2 for this:
divide(L, N) ->
divide(L, N, []).
divide([], _, Acc) ->
lists:reverse(Acc);
divide(L, N, Acc) when length(L) < N ->
lists:reverse([L|Acc]);
divide(L, N, Acc) ->
{H,T} = lists:split(N, L),
divide(T, N, [H|Acc]).
The first function, divide/2, serves as the entry point. It merely calls the helper function divide/3 with an initial accumulator value of an empty list, and then divide/3 does all the work. The first clause of divide/3 matches when the list has been completely processed, so it just reverses the accumulator and returns that value. The second clause handles the case when the length of L is less than the requested N value; it creates a new accumulator by prepending Acc with L and then returning the reverse of that new accumulator. The third clause first calls lists:split/2 to split the incoming list into H, which is a list of N elements, and T, the remainder of the list. It then calls itself recursively, passing T as the new list value, the original N value, and a new accumulator consisting of H as the first element and the original accumulator, Acc, as the tail.

How to collect frequencies of characters using a list of tuples {char,freq} in Erlang

I am supposed to collect frequencies of characters.
freq(Sample) -> freq(Sample,[]).
freq([],Freq) ->
Freq;
freq([Char|Rest],Freq)->
freq(Rest,[{Char,1}|Freq]).
This function does not work in the right way. If the input is "foo", then the output will be
[{f,1},{o,1},{o,1}].
But I wished to have the output like
[{f,1},{o,2}].
I can't manage to modify element in a tulpe. Can anyone help me out of this and show me how it can be fixed?
a one line solution :o)
% generate a random list
L = [random:uniform(26)+$a-1 || _ <- lists:seq(1,1000)].
% collect frequency
lists:foldl(fun(X,[{[X],I}|Q]) -> [{[X],I+1}|Q] ; (X,Acc) -> [{[X],1}|Acc] end , [], lists:sort(L)).
in action
1> lists:foldl(fun(X,[{[X],I}|Q]) -> [{[X],I+1}|Q] ; (X,Acc) -> [{[X],1}|Acc] end , [], lists:sort("foo")).
[{"o",2},{"f",1}]
quite fast with short list, but the execution time increase a lot with long list (on my PC, it needs 6.5s for a 1 000 000 character text) .
in comparison, with the same 1 000 000 character text Ricardo solution needs 5 sec
I will try another version using ets.
By far the easiest way is to use an orddict to store the value as it already comes with an update_counter function and returns the value in a (sorted) list.
freq(Text) ->
lists:foldl(fun (C, D) -> orddict:update_counter(C, 1, D) end, orddict:new(), Text).
Try with something like this:
freq(Text) ->
CharsDictionary = lists:foldl(fun(Char, Acc) -> dict:update_counter(Char, 1, Acc) end, dict:new(), Text),
dict:fold(fun(Char, Frequency, Acc) -> [{Char, Frequency} | Acc] end, [], CharsDictionary).
The first line creates a dictionary that uses the char as key and the frequency as value (dict:update_counter).
The second line converts the dictionary in the list that you need.
Using pattern matching and proplists.
-module(freq).
-export([char_freq/1]).
-spec char_freq(string()) -> [tuple()].
char_freq(L) -> char_freq(L, []).
char_freq([], PL) -> PL;
char_freq([H|T], PL) ->
case proplists:get_value([H], PL) of
undefined ->
char_freq(T, [{[H],1}|PL]);
N ->
L = proplists:delete([H], PL),
char_freq(T, [{[H],N+1}|L])
end.
Test
1> freq:char_freq("abacabz").
[{"z",1},{"b",2},{"a",3},{"c",1}]
L = [list_to_atom(X) || X <- Str].
D = lists:foldl(fun({Char, _}, Acc) -> dict:update_counter(Char, 1, Acc) end, dict:new(), L).
dict:to_list(D).

Splitting a list in equal sized chunks in Erlang

I want to split:
[1,2,3,4,5,6,7,8]
into:
[[1,2],[3,4],[5,6],[7,8]]
It generally works great with:
[ lists:sublist(List, X, 2) || X <- lists:seq(1,length(List),2) ] .
But it is really slow this way. 10000 Elements take amazing 2.5 seconds on my netbook. I have also written a really fast recursive function, but I am simply interested: Could this list comprehension also be written in a different way, so that it is faster?
Try this:
part(List) ->
part(List, []).
part([], Acc) ->
lists:reverse(Acc);
part([H], Acc) ->
lists:reverse([[H]|Acc]);
part([H1,H2|T], Acc) ->
part(T, [[H1,H2]|Acc]).
Test in erlang-shell (I've declared this function in module part):
2> part:part([1,2,3,4,5,6,7,8]).
[[1,2],[3,4],[5,6],[7,8]]
3>
3> timer:tc(part, part, [lists:seq(1,10000)]).
{774,
[[1,2],
[3,4],
[5,6],
[7,8],
"\t\n","\v\f",
[13,14],
[15,16],
[17,18],
[19,20],
[21,22],
[23,24],
[25,26],
[27,28],
[29,30],
[31,32],
"!\"","#$","%&","'(",")*","+,","-.","/0","12","34",
[...]|...]}
Just 774 microseconds (which is ~0,8 milliseconds)
Here are two quick solutions for you that are both flexible. One is easy to read, but only slightly faster than your proposed solution. The other is quite fast, but is a bit cryptic to read. And note that both of my proposed algorithms will work for lists of anything, not just numeric ordered lists.
Here is the "easy-to-read" one. Call by n_length_chunks(List,Chunksize). For example, to get a list of chunks 2 long, call n_length_chunks(List,2). This works for chunks of any size, ie, you could call n_length_chunks(List,4) to get [[1,2,3,4],[5,6,7,8],...]
n_length_chunks([],_) -> [];
n_length_chunks(List,Len) when Len > length(List) ->
[List];
n_length_chunks(List,Len) ->
{Head,Tail} = lists:split(Len,List),
[Head | n_length_chunks(Tail,Len)].
The much faster one is here, but is definitely harder to read, and is called in the same way: n_length_chunks_fast(List,2) (I've made one change to this compared with the one above, in that it pads the end of the list with undefined if the length of the list isn't cleanly divisible by the desired chunk length.
n_length_chunks_fast(List,Len) ->
LeaderLength = case length(List) rem Len of
0 -> 0;
N -> Len - N
end,
Leader = lists:duplicate(LeaderLength,undefined),
n_length_chunks_fast(Leader ++ lists:reverse(List),[],0,Len).
n_length_chunks_fast([],Acc,_,_) -> Acc;
n_length_chunks_fast([H|T],Acc,Pos,Max) when Pos==Max ->
n_length_chunks_fast(T,[[H] | Acc],1,Max);
n_length_chunks_fast([H|T],[HAcc | TAcc],Pos,Max) ->
n_length_chunks_fast(T,[[H | HAcc] | TAcc],Pos+1,Max);
n_length_chunks_fast([H|T],[],Pos,Max) ->
n_length_chunks_fast(T,[[H]],Pos+1,Max).
Tested on my (really old) laptop:
Your proposed solution took about 3 seconds.
My slow-but-readable one was slightly faster and takes about 1.5 seconds (still quite slow)
My fast version takes about 5 milliseconds.
For completeness, Isac's solution took about 180 milliseconds on my same machine.
Edit: wow, I need to read the complete question first. Oh well I'll keep here for posterity if it helps. As far as I can tell, there's not a good way to do this using list comprehensions. Your original version is slow because each iteration of sublist needs to traverse the list each time to get to each successive X, resulting in complexity just under O(N^2).
Or with a fold:
lists:foldr(fun(E, []) -> [[E]];
(E, [H|RAcc]) when length(H) < 2 -> [[E|H]|RAcc] ;
(E, [H|RAcc]) -> [[E],H|RAcc]
end, [], List).
I want to submit slightly complicated but more flexible (and mostly faster) solution of one proposed by #Tilman
split_list(List, Max) ->
element(1, lists:foldl(fun
(E, {[Buff|Acc], C}) when C < Max ->
{[[E|Buff]|Acc], C+1};
(E, {[Buff|Acc], _}) ->
{[[E],Buff|Acc], 1};
(E, {[], _}) ->
{[[E]], 1}
end, {[], 0}, List)).
so function part can be implemented as
part(List) ->
RevList = split_list(List, 2),
lists:foldl(fun(E, Acc) ->
[lists:reverse(E)|Acc]
end, [], RevList).
update
I've added reverse in case if you want to preserve order, but as I can see it adds no more than 20% of processing time.
You could do it like this:
1> {List1, List2} = lists:partition(fun(X) -> (X rem 2) == 1 end, List).
{[1,3,5|...],[2,4,6|...]}
2> lists:zipwith(fun(X, Y) -> [X, Y] end, List1, List2).
[[1,2],[3,4],[5,6]|...]
This takes ~73 milliseconds with a 10000 elements List on my computer. The original solution takes ~900 miliseconds.
But I would go with the recursive function anyway.
I was looking for a partition function which can split a large list to small amount of workers. With lkuty's partition you might get that one worker gets almost double work than all the others. If that's not what you want, here is a version which sublist lengths differ by at most 1.
Uses PropEr for testing.
%% #doc Split List into sub-lists so sub-lists lengths differ most by 1.
%% Does not preserve order.
-spec split_many(pos_integer(), [T]) -> [[T]] when T :: term().
split_many(N, List) ->
PieceLen = length(List) div N,
lists:reverse(split_many(PieceLen, N, List, [])).
-spec split_many(pos_integer(), pos_integer(), [T], [[T]]) ->
[[T]] when T :: term().
split_many(PieceLen, N, List, Acc) when length(Acc) < N ->
{Head, Tail} = lists:split(PieceLen, List),
split_many(PieceLen, N, Tail, [Head|Acc]);
split_many(_PieceLen, _N, List, Acc) ->
% Add an Elem to each list in Acc
{Appendable, LeaveAlone} = lists:split(length(List), Acc),
Appended = [[Elem|XS] || {Elem, XS} <- lists:zip(List, Appendable)],
lists:append(Appended, LeaveAlone).
Tests:
split_many_test_() ->
[
?_assertEqual([[1,2]], elibs_lists:split_many(1, [1,2])),
?_assertEqual([[1], [2]], elibs_lists:split_many(2, [1,2])),
?_assertEqual([[1], [3,2]], elibs_lists:split_many(2, [1,2,3])),
?_assertEqual([[1], [2], [4,3]], elibs_lists:split_many(3, [1,2,3,4])),
?_assertEqual([[1,2], [5,3,4]], elibs_lists:split_many(2, [1,2,3,4,5])),
?_assert(proper:quickcheck(split_many_proper1())),
?_assert(proper:quickcheck(split_many_proper2()))
].
%% #doc Verify all elements are preserved, number of groups is correct,
%% all groups have same number of elements (+-1)
split_many_proper1() ->
?FORALL({List, Groups},
{list(), pos_integer()},
begin
Split = elibs_lists:split_many(Groups, List),
% Lengths of sub-lists
Lengths = lists:usort(lists:map(fun erlang:length/1, Split)),
length(Split) =:= Groups andalso
lists:sort(lists:append(Split)) == lists:sort(List) andalso
length(Lengths) =< 2 andalso
case Lengths of
[Min, Max] -> Max == Min + 1;
[_] -> true
end
end
).
%% #doc If number of groups is divisable by number of elements, ordering must
%% stay the same
split_many_proper2() ->
?FORALL({Groups, List},
?LET({A, B},
{integer(1, 20), integer(1, 10)},
{A, vector(A*B, term())}),
List =:= lists:append(elibs_lists:split_many(Groups, List))
).
Here is a more general answer that works with any sublist size.
1> lists:foreach(fun(N) -> io:format("~2.10.0B -> ~w~n",[N, test:partition([1,2,3,4,5,6,7,8,9,10],N)] ) end, [1,2,3,4,5,6,7,8,9,10]).
01 -> [[1],[2],[3],[4],[5],[6],[7],[8],[9],[10]]
02 -> [[1,2],[3,4],[5,6],[7,8],[9,10]]
03 -> [[1,2,3],[4,5,6],[7,8,9],[10]]
04 -> [[1,2,3,4],[5,6,7,8],[10,9]]
05 -> [[1,2,3,4,5],[6,7,8,9,10]]
06 -> [[1,2,3,4,5,6],[10,9,8,7]]
07 -> [[1,2,3,4,5,6,7],[10,9,8]]
08 -> [[1,2,3,4,5,6,7,8],[10,9]]
09 -> [[1,2,3,4,5,6,7,8,9],[10]]
10 -> [[1,2,3,4,5,6,7,8,9,10]]
And the code to achieve this is stored inside a file called test.erl:
-module(test).
-compile(export_all).
partition(List, N) ->
partition(List, 1, N, []).
partition([], _C, _N, Acc) ->
lists:reverse(Acc) ;
partition([H|T], 1, N, Acc) ->
partition(T, 2, N, [[H]|Acc]) ;
partition([H|T], C, N, [HAcc|TAcc]) when C < N ->
partition(T, C+1, N, [[H|HAcc]|TAcc]) ;
partition([H|T], C, N, [HAcc|TAcc]) when C == N ->
partition(T, 1, N, [lists:reverse([H|HAcc])|TAcc]) ;
partition(L, C, N, Acc) when C > N ->
partition(L, 1, N, Acc).
It could probably be more elegant regarding the special case where C > N. Note that C is the size of the current sublist being constructed. At start, it is 1. And then it increments until it reaches the partition size of N.
We could also use a modified version of #chops code to let the last list contains the remaining items even if its size < N :
-module(n_length_chunks_fast).
-export([n_length_chunks_fast/2]).
n_length_chunks_fast(List,Len) ->
SkipLength = case length(List) rem Len of
0 -> 0;
N -> Len - N
end,
n_length_chunks_fast(lists:reverse(List),[],SkipLength,Len).
n_length_chunks_fast([],Acc,_Pos,_Max) -> Acc;
n_length_chunks_fast([H|T],Acc,Pos,Max) when Pos==Max ->
n_length_chunks_fast(T,[[H] | Acc],1,Max);
n_length_chunks_fast([H|T],[HAcc | TAcc],Pos,Max) ->
n_length_chunks_fast(T,[[H | HAcc] | TAcc],Pos+1,Max);
n_length_chunks_fast([H|T],[],Pos,Max) ->
n_length_chunks_fast(T,[[H]],Pos+1,Max).
I've slightly altered the implementation from #JLarky to remove the guard expression, which should be slightly faster:
split_list(List, Max) ->
element(1, lists:foldl(fun
(E, {[Buff|Acc], 1}) ->
{[[E],Buff|Acc], Max};
(E, {[Buff|Acc], C}) ->
{[[E|Buff]|Acc], C-1};
(E, {[], _}) ->
{[[E]], Max}
end, {[], Max}, List)).

Resources