Related
I am new to Erlang and OTP, and I have two questions.
I am trying to read some Erlang code from the ejabberd source. It is very complex, and I was surprised to find two start/0 functions and two start/1 functions in ejabberd_logger.erl, so:
1. What is this? Which function body will be executed when start/1 is called?
2. Should I read and understand all of the ejabberd source code in order to modify and customize it as I want? I can see that it is a huge code base, but is this a must for handling it well?
-module(ejabberd_logger).
-compile({no_auto_import, [get/0]}).
%% API
-export([start/0, get/0, set/1, get_log_path/0, flush/0]).
-export([convert_loglevel/1, loglevels/0]).
-ifndef(LAGER).
-export([progress_filter/2]).
-endif.
%% Deprecated functions
-export([restart/0, reopen_log/0, rotate_log/0]).
-deprecated([{restart, 0},
{reopen_log, 0},
{rotate_log, 0}]).
-type loglevel() :: none | emergency | alert | critical |
error | warning | notice | info | debug.
-define(is_loglevel(L),
((L == none) or (L == emergency) or (L == alert)
or (L == critical) or (L == error) or (L == warning)
or (L == notice) or (L == info) or (L == debug))).
-export_type([loglevel/0]).
%%%===================================================================
%%% API
%%%===================================================================
%% Resolve the main log file path: prefer the ejabberd 'log_path'
%% application setting, then the EJABBERD_LOG_PATH environment
%% variable, and finally fall back to "ejabberd.log".
-spec get_log_path() -> string().
get_log_path() ->
    case ejabberd_config:env_binary_to_list(ejabberd, log_path) of
        {ok, ConfiguredPath} ->
            ConfiguredPath;
        undefined ->
            default_log_path()
    end.

%% Fallback used when no 'log_path' option is configured.
default_log_path() ->
    case os:getenv("EJABBERD_LOG_PATH") of
        false -> "ejabberd.log";
        EnvPath -> EnvPath
    end.
%% All supported log levels, ordered from least (none) to most
%% verbose (debug); mirrors the loglevel() type declared above.
-spec loglevels() -> [loglevel(), ...].
loglevels() ->
[none, emergency, alert, critical, error, warning, notice, info, debug].
%% Map ejabberd's legacy numeric loglevel (0..5) to a loglevel atom.
%% Any other argument fails with function_clause, as before.
-spec convert_loglevel(0..5) -> loglevel().
convert_loglevel(N) when is_integer(N), N >= 0, N =< 5 ->
    element(N + 1, {none, critical, error, warning, info, debug}).
%% True only when the ejabberd 'quiet' application flag is set to true.
quiet_mode() ->
    application:get_env(ejabberd, quiet) =:= {ok, true}.
%% Read a non-negative integer (or 'infinity') option from the
%% ejabberd application environment.  Malformed values are reported
%% via error_logger and replaced by Default.
-spec get_integer_env(atom(), T) -> T.
get_integer_env(Name, Default) ->
    case application:get_env(ejabberd, Name) of
        undefined ->
            Default;
        {ok, infinity} ->
            infinity;
        {ok, Value} when is_integer(Value), Value >= 0 ->
            Value;
        {ok, Bad} ->
            error_logger:error_msg("wrong value for ~ts: ~p; "
                                   "using ~p as a fallback~n",
                                   [Name, Bad, Default]),
            Default
    end.
-ifdef(LAGER).
%% Read a string (list) option from the ejabberd application
%% environment.  Malformed values are reported via error_logger and
%% replaced by Default.
-spec get_string_env(atom(), T) -> T.
get_string_env(Name, Default) ->
    case application:get_env(ejabberd, Name) of
        undefined ->
            Default;
        {ok, Value} when is_list(Value) ->
            Value;
        {ok, Bad} ->
            error_logger:error_msg("wrong value for ~ts: ~p; "
                                   "using ~p as a fallback~n",
                                   [Name, Bad, Default]),
            Default
    end.
-spec start() -> ok.
%% Start logging (lager build) at the default 'info' level.
start() ->
    start(info).

%% Start the lager logging stack at the given level.  If the Elixir
%% 'logger' application is already running, assume the lager-to-Elixir
%% bridge is in place and route everything through it instead of
%% doing a full lager setup.
start(Level) ->
    Running = application:which_applications(5000),
    case lists:keymember(logger, 1, Running) of
        true ->
            error_logger:info_msg("Ignoring ejabberd logger options, using Elixir Logger.", []),
            %% Do not start lager, we rely on Elixir Logger
            do_start_for_logger(Level);
        false ->
            do_start(Level)
    end.
%% Configure lager to stay out of the way and forward everything to
%% the Elixir Logger backend, then start it.
do_start_for_logger(Level) ->
    application:load(sasl),
    application:load(lager),
    Env = [{sasl, sasl_error_logger, false},
           {lager, error_logger_redirect, false},
           {lager, error_logger_whitelist, ['Elixir.Logger.ErrorHandler']},
           {lager, crash_log, false},
           {lager, handlers, [{elixir_logger_backend, [{level, Level}]}]}],
    lists:foreach(fun({App, Key, Val}) ->
                          application:set_env(App, Key, Val)
                  end, Env),
    ejabberd:start_app(lager),
    ok.
%% Full lager setup: derive the log file locations from the main log
%% path, read rotation/rate-limit options, configure console, file
%% and crash-log backends, start lager, and apply the rate limit
%% (high water mark) to every installed handler.
do_start(Level) ->
application:load(sasl),
application:set_env(sasl, sasl_error_logger, false),
application:load(lager),
ConsoleLog = get_log_path(),
Dir = filename:dirname(ConsoleLog),
ErrorLog = filename:join([Dir, "error.log"]),
CrashLog = filename:join([Dir, "crash.log"]),
LogRotateDate = get_string_env(log_rotate_date, ""),
%% lager expects 0 for "no size-based rotation".
LogRotateSize = case get_integer_env(log_rotate_size, 10*1024*1024) of
infinity -> 0;
V -> V
end,
LogRotateCount = get_integer_env(log_rotate_count, 1),
LogRateLimit = get_integer_env(log_rate_limit, 100),
%% Quiet mode pins the console at 'critical' regardless of Level.
ConsoleLevel0 = case quiet_mode() of
true -> critical;
_ -> Level
end,
%% lager >= 3.6.0 takes the console backend config as a proplist;
%% older versions take the bare level atom.
ConsoleLevel = case get_lager_version() >= "3.6.0" of
true -> [{level, ConsoleLevel0}];
false -> ConsoleLevel0
end,
application:set_env(lager, error_logger_hwm, LogRateLimit),
application:set_env(
lager, handlers,
[{lager_console_backend, ConsoleLevel},
{lager_file_backend, [{file, ConsoleLog}, {level, Level}, {date, LogRotateDate},
{count, LogRotateCount}, {size, LogRotateSize}]},
{lager_file_backend, [{file, ErrorLog}, {level, error}, {date, LogRotateDate},
{count, LogRotateCount}, {size, LogRotateSize}]}]),
application:set_env(lager, crash_log, CrashLog),
application:set_env(lager, crash_log_date, LogRotateDate),
application:set_env(lager, crash_log_size, LogRotateSize),
application:set_env(lager, crash_log_count, LogRotateCount),
ejabberd:start_app(lager),
%% Raise the per-handler message high water mark on all handlers.
lists:foreach(fun(Handler) ->
lager:set_loghwm(Handler, LogRateLimit)
end, gen_event:which_handlers(lager_event)).
-spec restart() -> ok.
%% Deprecated.  Restart lager at the configured loglevel; the stop is
%% best-effort (its result is ignored if lager was not running).
restart() ->
Level = ejabberd_option:loglevel(),
application:stop(lager),
start(Level).
-spec reopen_log() -> ok.
%% Deprecated no-op: lager reopens its own files (e.g. after external
%% log rotation).
reopen_log() ->
ok.
-spec rotate_log() -> ok.
%% Deprecated.  Ask the crash log and every lager file backend to
%% rotate now.  The 'catch' guards against lager_crash_log not being
%% registered (sending to an unregistered name is badarg).
rotate_log() ->
    catch lager_crash_log ! rotate,
    RotateBackend = fun({lager_file_backend, File}) ->
                            whereis(lager_event) ! {rotate, File};
                       (_Other) ->
                            ok
                    end,
    lists:foreach(RotateBackend, gen_event:which_handlers(lager_event)).
-spec get() -> loglevel().
%% Current loglevel as reported by the console (or Elixir) backend;
%% 'none' when neither backend is installed.
get() ->
    Pick = fun(Backend, Acc) ->
                   case lists:member(Backend, [lager_console_backend,
                                               elixir_logger_backend]) of
                       true -> lager:get_loglevel(Backend);
                       false -> Acc
                   end
           end,
    lists:foldl(Pick, none, get_lager_handlers()).
-spec set(0..5 | loglevel()) -> ok.
%% Change the loglevel of the console/Elixir backend and of the main
%% log file backend; accepts the legacy numeric levels (0..5) too.
%% Also toggles xmpp stanza debug output when entering or leaving
%% 'debug'.
set(N) when is_integer(N), N>=0, N=<5 ->
set(convert_loglevel(N));
set(Level) when ?is_loglevel(Level) ->
case get() of
Level ->
ok;
_ ->
ConsoleLog = get_log_path(),
QuietMode = quiet_mode(),
lists:foreach(
%% Only the main log file backend changes level; the
%% error.log backend stays at 'error'.
fun({lager_file_backend, File} = H) when File == ConsoleLog ->
lager:set_loglevel(H, Level);
%% In quiet mode the console stays at 'critical'.
(lager_console_backend = H) when not QuietMode ->
lager:set_loglevel(H, Level);
(elixir_logger_backend = H) ->
lager:set_loglevel(H, Level);
(_) ->
ok
end, get_lager_handlers())
end,
case Level of
debug -> xmpp:set_config([{debug, true}]);
_ -> xmpp:set_config([{debug, false}])
end.
%% List the handlers installed in the lager event manager, or []
%% when lager is not running.  Uses try/catch instead of the old
%% style 'catch' expression so that only the expected 'noproc' exit
%% (gen:call on an unregistered name) is swallowed; other errors and
%% throws surface with their stacktrace instead of being returned as
%% an {'EXIT', _} tuple.
get_lager_handlers() ->
    try
        gen_event:which_handlers(lager_event)
    catch
        exit:noproc -> []
    end.
-spec get_lager_version() -> string().
%% Version string of the loaded lager application, "0.0.0" when lager
%% is not loaded; used to pick a compatible handler config format.
get_lager_version() ->
    case lists:keyfind(lager, 1, application:loaded_applications()) of
        {lager, _Descr, Version} -> Version;
        false -> "0.0.0"
    end.
-spec flush() -> ok.
%% Stop the logging applications so buffered messages are written
%% out; used on shutdown.  NOTE(review): the actual return value is
%% whatever application:stop(sasl) returns, which can be {error, _}
%% when sasl was not running — confirm against the spec.
flush() ->
application:stop(lager),
application:stop(sasl).
-else.
-include_lib("kernel/include/logger.hrl").
-spec start() -> ok | {error, term()}.
%% Configure the built-in OTP logger at the default 'info' level.
start() ->
start(info).
%% Configure the built-in OTP logger: set the primary level, restyle
%% the default (console) handler, install a primary filter that
%% drops noisy progress reports, and add two file handlers — the
%% full ejabberd.log and an error-only error.log — with size-based
%% rotation.  Returns ok, or the offending error term if any logger
%% call fails.
start(Level) ->
EjabberdLog = get_log_path(),
Dir = filename:dirname(EjabberdLog),
ErrorLog = filename:join([Dir, "error.log"]),
LogRotateSize = get_integer_env(log_rotate_size, 10*1024*1024),
LogRotateCount = get_integer_env(log_rotate_count, 1),
%% Shared handler config: rotation limits, no periodic fsync (an
%% explicit flush/0 is provided instead), and overload-protection
%% queue thresholds.
Config = #{max_no_bytes => LogRotateSize,
max_no_files => LogRotateCount,
filesync_repeat_interval => no_repeat,
file_check => 1000,
sync_mode_qlen => 1000,
drop_mode_qlen => 1000,
flush_qlen => 5000},
%% Shared formatter config; the time_designator $ (a space literal)
%% separates date and time in timestamps.
FmtConfig = #{legacy_header => false,
time_designator => $ ,
max_size => 100*1024,
single_line => false},
FileFmtConfig = FmtConfig#{template => file_template()},
ConsoleFmtConfig = FmtConfig#{template => console_template()},
try
ok = logger:set_primary_config(level, Level),
DefaultHandlerId = get_default_handlerid(),
ok = logger:update_formatter_config(DefaultHandlerId, ConsoleFmtConfig),
%% In quiet mode only critical messages reach the console.
case quiet_mode() of
true ->
ok = logger:set_handler_config(DefaultHandlerId, level, critical);
_ ->
ok
end,
%% Each add_* below tolerates already_exist so start/1 can be
%% called again (e.g. on restart) without failing.
case logger:add_primary_filter(progress_report,
{fun ?MODULE:progress_filter/2, stop}) of
ok -> ok;
{error, {already_exist, _}} -> ok
end,
case logger:add_handler(ejabberd_log, logger_std_h,
#{level => all,
config => Config#{file => EjabberdLog},
formatter => {logger_formatter, FileFmtConfig}}) of
ok -> ok;
{error, {already_exist, _}} -> ok
end,
case logger:add_handler(error_log, logger_std_h,
#{level => error,
config => Config#{file => ErrorLog},
formatter => {logger_formatter, FileFmtConfig}}) of
ok -> ok;
{error, {already_exist, _}} -> ok
end
catch _:{Tag, Err} when Tag == badmatch; Tag == case_clause ->
?LOG_CRITICAL("Failed to set logging: ~p", [Err]),
Err
end.
%% Id of the handler whose formatter we tweak: the standard 'default'
%% handler when installed, otherwise the first installed handler.
get_default_handlerid() ->
    Handlers = logger:get_handler_ids(),
    pick_default(lists:member(default, Handlers), Handlers).

pick_default(true, _Handlers) -> default;
pick_default(false, Handlers) -> hd(Handlers).
-spec restart() -> ok.
%% Deprecated no-op in the OTP-logger build (nothing to restart).
restart() ->
ok.
%% Primary logger filter: supervisor/application progress reports are
%% dropped unless the current loglevel is 'debug', in which case they
%% are demoted to debug level.  Any other event passes through.
progress_filter(#{level := info, msg := {report, #{label := {_, progress}}}} = Event, _Extra) ->
    filter_progress(get(), Event);
progress_filter(Event, _Extra) ->
    Event.

filter_progress(debug, Event) ->
    logger_filters:progress(Event#{level => debug}, log);
filter_progress(_Level, _Event) ->
    stop.
%% Template for console output: "<time> [<level>] <msg>".
console_template() ->
[time, " [", level, "] " | msg()].
%% Template for file output: additionally includes the pid and, when
%% available, "#Module:Function/Arity:Line".
file_template() ->
[time, " [", level, "] ", pid,
{mfa, ["#", mfa, {line, [":", line], []}], []}, " " | msg()].
%% Common tail of both templates: optional report title, then the
%% message and a trailing newline.
msg() ->
[{logger_formatter, [[logger_formatter, title], ":", io_lib:nl()], []},
msg, io_lib:nl()].
-spec reopen_log() -> ok.
%% Deprecated no-op: logger_std_h detects rotated/renamed files by
%% itself (see the file_check option set in start/1).
reopen_log() ->
ok.
-spec rotate_log() -> ok.
%% Deprecated no-op: rotation is handled by the handler itself via
%% max_no_bytes / max_no_files.
rotate_log() ->
ok.
-spec get() -> loglevel().
%% The primary loglevel currently configured in the OTP logger.
get() ->
    maps:get(level, logger:get_primary_config()).
-spec set(0..5 | loglevel()) -> ok.
%% Change the primary loglevel; accepts either the legacy numeric
%% level (0..5) or a loglevel atom.  Also toggles xmpp stanza debug
%% output when entering or leaving 'debug'.  Does nothing when the
%% level is unchanged.
set(N) when is_integer(N), N>=0, N=<5 ->
set(convert_loglevel(N));
set(Level) when ?is_loglevel(Level) ->
case get() of
Level -> ok;
PrevLevel ->
?LOG_NOTICE("Changing loglevel from '~s' to '~s'",
[PrevLevel, Level]),
logger:set_primary_config(level, Level),
case Level of
debug -> xmpp:set_config([{debug, true}]);
_ -> xmpp:set_config([{debug, false}])
end
end.
-spec flush() -> ok.
%% Force a filesync on every standard / disk_log logger handler so
%% buffered log messages reach the disk (used before shutdown).
flush() ->
    SyncHandler = fun(#{id := Id, module := logger_std_h}) ->
                          logger_std_h:filesync(Id);
                     (#{id := Id, module := logger_disk_log_h}) ->
                          logger_disk_log_h:filesync(Id);
                     (_Other) ->
                          ok
                  end,
    lists:foreach(SyncHandler, logger:get_handler_config()).
-endif.
Here, the first version is protected by -ifdef(LAGER). and thus gets used if the macro LAGER is defined, and the second version comes after '-else.', and gets used if LAGER is not defined. (The first version uses the Lager library for logging, and the second version uses the newer built-in logger library.)
I accidentally did (the equivalent of) the following:
lists:foldl(fun(X, Acc) -> [X|Acc] end, 0, List).
Note the not-a-list initial value for the accumulator.
This resulted in an improper list. This means that length, etc., don't work on it.
Given that my "equivalent of" took an hour to run, and I don't want to run it again, how do I repair my improper list?
For a simpler example of an improper list and the problem that it causes:
1> L = [1|[2|[3|4]]].
[1,2,3|4]
2> length(L).
** exception error: bad argument
in function length/1
called as length([1,2,3|4])
If you want to preserve the "improper tail", this would be enough:
Fix = fun Fix([H | T]) -> [H | Fix(T)];
Fix(T) -> [T]
end.
Here is a possible approach:
Lister = fun L([], Acc) -> lists:reverse(Acc);
L([[_ | _] = H | T], Acc) -> L(T, [L(H, []) | Acc]);
L([[] | T], Acc) -> L(T, Acc);
L([H | T], Acc) -> L(T, [H | Acc]);
L(X, Acc) -> L([], [X | Acc])
end.
L = [[[1,[1|2]],1|2],1|[2|[3|4]]].
Lister(L, []).
% output [[[1,[1,2]],1,2],1,2,3,4]
For the simple case I had, with non-nested improper list, where I don't want the extra item (because it should have been an empty list and doesn't mean anything), this'll do it:
Fix = fun F([H|T], A) when is_list(T) -> F(T, [H|A]);
F([H|_], A) -> F([], [H|A]);
F([], A) -> lists:reverse(A)
end.
Fix(L, []).
You must use a list for the accumulator (Acc):
lists:foldl(fun(X, Acc) -> [X|Acc] end, 0, [1,2,3]).
result => [3,2,1|0]
but if you use [0] as the Acc argument of lists:foldl/3, like below:
lists:foldl(fun(X, Acc) -> [X|Acc] end, [0], [1,2,3]).
result => [3,2,1,0]
I am beginning Erlang and as an exercise I tried to implement the CYK algorithm.
Main code(cyk.erl):
%%% An attempt for a CYK parser in Erlang
-module(cyk).
-export([
init_storage/0,
import_grammar_file/1,
add_grammar_rule/1,
analyze/1,
test_analyze/0
]).
%% Initialize the ets storage for grammar
%% Creates the named 'bag' table 'cyk' (one non-terminal may map to
%% several right-hand sides); must be called before importing rules.
%% Returns the table name.
init_storage() ->
ets:new(?MODULE, [bag, named_table]).
%%------------------------------------------
%%
%% Grammar
%%
%%------------------------------------------
%% Import a grammar file: add each of its lines as a grammar rule and
%% close the device when eof is reached.
import_grammar_file(File) ->
    {ok, Device} = file:open(File, read),
    import_file_rules(Device).

%% Recursively consume lines from Device until eof.
import_file_rules(Device) ->
    import_file_rules(Device, io:get_line(Device, "")).

import_file_rules(Device, eof) ->
    io:format("Grammar file imported~n"),
    file:close(Device);
import_file_rules(Device, Line) ->
    add_grammar_rule(Line),
    import_file_rules(Device, io:get_line(Device, "")).
%% Add a grammar rule
%% Parses "LHS -> RHS" and stores {LHS, RHS} (as binaries) in the
%% grammar table; complains on stdout when the line does not parse.
%% NOTE: inside an Erlang string literal "\s" is itself a space
%% character, so this regex matches a literal space, which happens
%% to be what is wanted here.
add_grammar_rule(Rule) ->
    Re = "^([^\s]+)\s?->\s?([^\n]+)$",
    case re:run(Rule, Re, [{capture, all_but_first, binary}]) of
        {match, [Lhs, Rhs]} ->
            ets:insert(?MODULE, {Lhs, Rhs}),
            io:format("parsing ~p -> ~p~n", [Lhs, Rhs]);
        nomatch ->
            io:format("cannot parse ~p~n", [Rule])
    end.
%%------------------------------------------
%%
%% Main logic
%%
%%------------------------------------------
%% Analyze a sentence: split it into words, map each word to the
%% grammar symbols that can produce it, then run the CYK table
%% construction and print each intermediate stage.
analyze(Sentence) ->
    io:format("analysing: ~p~n", [Sentence]),
    WordList = re:split(Sentence, " "),
    io:format("wordlist: ~p~n", [WordList]),
    Representation = [associate(Word) || Word <- WordList],
    io:format("representation: ~p~n", [Representation]),
    Result = process([Representation]),
    io:format("result: ~p~n", [Result]).
% associate sentence words with grammar terms
% Returns the flattened list of grammar symbols that can directly
% produce Word, or [] when the word is unknown.  The original
% case expression was redundant: lists:flatten/1 already maps the
% empty match list to [].
associate(Word) ->
    lists:flatten(ets:match(cyk, {'$1', Word})).
% process sentence representation
% Builds the CYK table bottom-up: the last row of Representation is
% the word row; each pass prepends one shorter row on top until the
% row of length 1 (spanning the whole sentence) has been produced.
process(Representation) ->
Limit = length(lists:last(Representation)),
process(Representation, Limit).
% One pass: compute the next (one-shorter) row and prepend it.
process(Representation, Limit) when Limit > 1 ->
NextStep = process(Representation, 1, Limit-1, []),
process([NextStep|Representation], Limit-1);
process(Representation, _Limit) ->
Representation.
% Compute the cells of the next row left to right: one subtree is
% extracted per starting index and reduced to the cell's symbols.
process(Representation, Index, Limit, Acc) when Index =< Limit ->
Subtree = extract_subtree(lists:reverse(Representation), Index),
Result = process_subtree(Subtree),
process(Representation, Index+1, Limit, [Result|Acc]);
process(_Representation, _Index, _Limit, Acc) ->
lists:reverse(Acc).
%%------------------------------------------
%%
%% Subtree
%%
%%------------------------------------------
%% Reduce one subtree of the CYK table: for each split point, pair
%% the left cell's symbols with the matching right cell's symbols,
%% and look the concatenations up in the grammar table.  Per-index
%% results are collected and appended once at the end instead of the
%% original "Acc ++ Result" in every iteration, which was O(n^2) in
%% the number of iterations.
process_subtree(Subtree) ->
    process_subtree(Subtree, Subtree, [], 1).

process_subtree([], _Subtree, Acc, _Index) ->
    lists:append(lists:reverse(Acc));
process_subtree([H|T], Subtree, Acc, Index) ->
    A = lists:nth(1, H),
    Bind = length(Subtree) - Index + 1,
    B = lists:last(lists:nth(Bind, Subtree)),
    % generating the possibilities of grammar
    Pos = [list_to_binary(binary:bin_to_list(X) ++ " " ++ binary:bin_to_list(Y)) || X <- A, Y <- B],
    % looking up in the grammar
    Result = lists:flatten([ets:match(cyk, {'$1', X}) || X <- Pos]),
    process_subtree(T, Subtree, [Result | Acc], Index + 1).
%% Extract a subtree from the representation
%% Take, from every row, the slice starting at Position; the slice
%% width shrinks by one for each subsequent (longer) row, starting at
%% length(Representation) + 1 for the first row.
extract_subtree(Representation, Position) ->
    Width = length(Representation) + 1,
    [lists:sublist(Row, Position, Width - Depth)
     || {Row, Depth} <- lists:zip(Representation,
                                  lists:seq(0, length(Representation) - 1))].
%%------------------------------------------
%%
%% Test
%% using the same example as
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------
%% Run the full pipeline on the Wikipedia CYK example; expects
%% "grammar.txt" in the current working directory.
test_analyze() ->
init_storage(),
import_grammar_file("grammar.txt"),
analyze("she eats a fish with a fork").
The grammar file (grammar.txt)
S -> NP VP
VP -> VP PP
VP -> V NP
VP -> eats
PP -> P NP
NP -> Det N
NP -> she
V -> eats
P -> with
N -> fish
N -> fork
Det -> a
The code can be tested from the erlang shell
> c(cyk).
> cyk:test_analyze().
parsing <<"S">> -> <<"NP VP">>
parsing <<"VP">> -> <<"VP PP">>
parsing <<"VP">> -> <<"V NP">>
parsing <<"VP">> -> <<"eats">>
parsing <<"PP">> -> <<"P NP">>
parsing <<"NP">> -> <<"Det N">>
parsing <<"NP">> -> <<"she">>
parsing <<"V">> -> <<"eats">>
parsing <<"P">> -> <<"with">>
parsing <<"N">> -> <<"fish">>
parsing <<"N">> -> <<"fork">>
parsing <<"Det">> -> <<"a">>
Grammar file imported
analysing: "she eats a fish with a fork"
wordlist: [<<"she">>,<<"eats">>,<<"a">>,<<"fish">>,<<"with">>,<<"a">>,
<<"fork">>]
representation: [[<<"NP">>],
[<<"VP">>,<<"V">>],
[<<"Det">>],
[<<"N">>],
[<<"P">>],
[<<"Det">>],
[<<"N">>]]
result: [[[<<"S">>]],
[[],[<<"VP">>]],
[[],[],[]],
[[<<"S">>],[],[],[]],
[[],[<<"VP">>],[],[],[<<"PP">>]],
[[<<"S">>],[],[<<"NP">>],[],[],[<<"NP">>]],
[[<<"NP">>],
[<<"VP">>,<<"V">>],
[<<"Det">>],
[<<"N">>],
[<<"P">>],
[<<"Det">>],
[<<"N">>]]]
The code seems to work fine for this example, but I was looking for ways to improve it (to make it more Erlang-ish), and especially to make the processing distributed over multiple processes/nodes.
I guess all the process_subtree executions for each step could be done concurrent, but I can't really figure how.
Any suggestions will be greatly appreciated!
I have written this solution which use concurrent execution.
Compared to Eric's solution, some changes were needed to use multiple processes; others were made because I think they are more efficient (I swapped keys and values in the rules ets table, and I chose a set), some because I think it is cleaner (I close the grammar file in the function that opens it), and some because I am more familiar with these modules (string:tokens ...).
[edit]
I have replaced a useless spawn by faster recursive call, and suppressed the wait function by adding a message to synchronize the processes.
I got the idea of this implementation looking at the nice animation at a Javascript animation of the CYK algorithm, which is unfortunately no longer available.
@Eric: it is possible to look at all the steps of the analysis by opening the 'analyze' ets table with observer; that is why I do not delete it.
-module(cyk).
-export([
import_grammar_file/1,
add_grammar_rule/2,
analyze/1,
test_analyze/1,
test_analyze/0
]).
%%------------------------------------------
%%
%% Grammar
%%
%%------------------------------------------
%% Import a grammar file
%% (Re)create the rules table, load every rule from Path, close the
%% device, and report completion.
import_grammar_file(Path) ->
    reset_ets(rules, ets:info(rules)),
    {ok, Fd} = file:open(Path, read),
    ok = add_grammar_rule(Fd, file:read_line(Fd)),
    file:close(Fd),
    io:format("Grammar file imported~n").
%% Add a grammar rule
%% Consume lines from Fd until eof.  A line "T -> H [Q]" is stored
%% reversed: the right-hand side is the ets key and the produced
%% symbol T is accumulated in the value list, for fast bottom-up
%% lookup during analysis.
add_grammar_rule(_Fd, eof) ->
    ok;
add_grammar_rule(Fd, {ok, Rule}) ->
    [Target, "->", First | Rest] = string:tokens(Rule, " \n"),
    RuleKey = key(First, Rest),
    insert(RuleKey, Target, ets:lookup(rules, RuleKey)),
    add_grammar_rule(Fd, file:read_line(Fd)).

%% Terminal rules are keyed by the single word; binary rules by the
%% {Left, Right} pair.
key(First, []) -> First;
key(First, [Second]) -> {First, Second}.

%% Start a new value list, or prepend to the existing one.
insert(RuleKey, Target, []) -> ets:insert(rules, {RuleKey, [Target]});
insert(RuleKey, Target, [{RuleKey, Existing}]) -> ets:insert(rules, {RuleKey, [Target | Existing]}).
%%------------------------------------------
%%
%% Main logic
%%
%%------------------------------------------
%% Analyze a sentence
%% Spawns one process per word (one per table column); each worker
%% fills its column of the CYK table bottom-up in the shared
%% 'analyze' ets table, synchronizing with its neighbour via
%% {done, Row} messages.  The first column's worker sends the final
%% result back to this process.
analyze(Sentence) ->
reset_ets(analyze, ets:info(analyze)),
io:format("analysing: ~p~n", [Sentence]),
WordList = string:tokens(Sentence, " "),
Len = length(WordList),
Me = self(),
%% Seed row 0 with each word's terminal rules and start the column
%% workers, handing each worker the pid of the previous column's
%% worker ('none' for the first column).
lists:foldl(fun(X,{J,Pid}) -> ets:insert(analyze,{{0,J},ets:lookup_element(rules,X,2)}),
(NewPid = spawn(fun() -> whatis(1,J,Len,Pid,Me) end)) ! {done,0},
{J+1,NewPid} end,
{1,none}, WordList),
receive
M -> M
end.
%% Create Name as a fresh public named set, or just empty it if it
%% already exists (ets:info/1 returns 'undefined' for nonexistent
%% tables).
reset_ets(Name, undefined) -> ets:new(Name,[set, named_table,public]);
reset_ets(Name, _) -> ets:delete_all_objects(Name).
%% Column worker: computes cell {I,J} (the span of length I+1 words
%% starting at word J) for increasing I.  The top cell of column 1
%% covers the whole sentence and is sent back to the caller.
whatis(Len,1,Len,_,PidRet) -> PidRet ! ets:lookup_element(analyze,{Len-1,1},2); % finished
whatis(I,J,Len,_,_) when I + J == Len +1 -> ok; % ends useless processes
whatis(I,J,Len,Pid,PidRet) ->
%% Wait for the {done, I-1} notification before reading row I-1
%% cells written by the neighbouring column's worker.
receive {done,V} when V == I-1 -> ok end,
%% All splits of the span into a left part {X-1,J} and a right part
%% {I-X,J+X}; pair up the symbol lists of the two cells.
Cases = lists:map(fun({X,Y}) -> [{A,B} || A <- ets:lookup_element(analyze,X,2),
B <- ets:lookup_element(analyze,Y,2)] end,
[{{X-1,J},{I-X,J+X}} || X <- lists:seq(1,I)]),
%% Collect the left-hand sides of every rule producing such a pair.
Val = lists:foldl(fun(X,Acc) -> case ets:lookup(rules,X) of
[] -> Acc;
[{_,[R]}] -> [R|Acc]
end end,
[],lists:flatten(Cases)),
ets:insert(analyze,{{I,J},Val}),
send(Pid,I),
whatis(I+1,J,Len,Pid,PidRet).
%% Notify the neighbouring worker that row I of this column is ready
%% ('none' when there is no neighbour to notify).
send(none,_) -> ok;
send(Pid,I) -> Pid ! {done,I}.
%%------------------------------------------
%%
%% Test
%% using the same example as
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------
%% Import the grammar from "grammar.txt" and analyze S.
test_analyze(S) ->
import_grammar_file("grammar.txt"),
analyze(S).
%% Same example sentence as the Wikipedia CYK article.
test_analyze() ->
test_analyze("she eats a fish with a fork").
I have a simple record structure consisting of a header (H) and a list of the data lines (D) 1:N. All header lines must start with a digit. All data lines have a leading whitespace. There also might be some empty lines (E) in between that must be ignored.
L = [H, D, D, E, H, D, E, H, D, D, D].
I would like to create a list of records:
-record(posting,{header,data}).
using list comprehension. Whats the best way to do it?
You should use lists:foldl/3 instead of a list comprehension in this case. With foldl/3 you can accumulate the header and data values across the whole list L.
You should do something like this:
%% Fold over the lines carrying {CurrentHeader, RecordsAcc}:
%%  - a line starting with a space (32) becomes a record under the
%%    current header;
%%  - an empty line is skipped;
%%  - a line starting with a digit opens a new header.
%% Records come out in reverse line order, as in the original sketch.
%% Fixes vs. the sketch: record update uses '=' (not the invalid
%% '=>'), and the digit guard was inverted — "F=<$0, F>=$9" can never
%% be true; it must be F >= $0 andalso F =< $9.
make_records(L) when is_list(L) ->
    F = fun([32 | _] = D, {#posting{} = H, Acc}) ->
                {H, [H#posting{data = D} | Acc]};
            ([], State) ->
                State;
            ([C | _] = H, {_, Acc}) when C >= $0, C =< $9 ->
                {#posting{header = H}, Acc}
        end,
    {_, R} = lists:foldl(F, {undefined, []}, L),
    R.
Anyway, I think the straightforward Erlang version does not seem too complicated, and it should be a little bit faster.
%% Straightforward recursive version: carry the current header while
%% walking the lines and emit one #posting{} per data line (records
%% come out in reverse line order).
make_records2(L) when is_list(L) ->
    make_records2(L, undefined, []).

make_records2([], _Header, Records) ->
    Records;
make_records2([[32 | _] = Data | Rest], Header, Records) when is_list(Header) ->
    make_records2(Rest, Header, [#posting{header = Header, data = Data} | Records]);
make_records2([[] | Rest], Header, Records) ->
    make_records2(Rest, Header, Records);
make_records2([[C | _] = Header | Rest], _Prev, Records) when C >= $0, C =< $9 ->
    make_records2(Rest, Header, Records).
Edit: If you have to add better row classification or parsing, adding new function is better because it improves readability.
%% Classify a line: {header, Line} when it starts with a digit,
%% 'empty' when it contains only whitespace, {data, Trimmed}
%% otherwise (with leading whitespace stripped).
parse_row([C | _] = Row) when C >= $0, C =< $9 ->
    {header, Row};
parse_row(Row) ->
    try_spaces(Row).

try_spaces([]) ->
    empty;
try_spaces([C | Rest]) when C =:= $\s; C =:= $\t; C =:= $\n ->
    try_spaces(Rest); % skip all white spaces from Data field
try_spaces(Data) ->
    {data, Data}.
You can use it like this:
%% Fold-based variant driven by parse_row/1.  Fixes vs. the sketch:
%% the fun was missing its closing 'end'; the record update used the
%% invalid '=>'; the 'empty' branch returned Acc instead of the
%% {Header, Acc} state tuple (breaking the fold state shape); and the
%% header clause matched against the already-bound variable H, so it
%% only matched a repeated header instead of binding the new one.
make_records(L) when is_list(L) ->
    F = fun(Row, {H, Acc}) ->
                case parse_row(Row) of
                    {data, D} when is_record(H, posting) -> {H, [H#posting{data = D} | Acc]};
                    empty -> {H, Acc};
                    {header, NewH} -> {#posting{header = NewH}, Acc}
                end
        end,
    {_, R} = lists:foldl(F, {undefined, []}, L),
    R.
Tail recursive native Erlang solution:
%% Tail-recursive variant over pre-classified rows: classify every
%% line first, then fold the tagged rows into records (records come
%% out in reverse line order).
make_records2(L) when is_list(L) ->
    Rows = [parse_row(Row) || Row <- L],
    make_records2(Rows, undefined, []).

make_records2([], _Header, Records) ->
    Records;
make_records2([{data, D} | Rest], Header, Records) when is_list(Header) ->
    make_records2(Rest, Header, [#posting{header = Header, data = D} | Records]);
make_records2([empty | Rest], Header, Records) ->
    make_records2(Rest, Header, Records);
make_records2([{header, NewHeader} | Rest], _Header, Records) ->
    make_records2(Rest, NewHeader, Records).
I think that there is no reason use tail recursion from performance point of view:
%% Body-recursive variant (no accumulator, preserves line order).
%% Fix vs. the sketch: the record field is 'header', not 'head' —
%% #posting{} declares fields header and data, so #posting{head=...}
%% does not compile.
make_records3(L) when is_list(L) ->
    make_records3(L, undefined).

make_records3([], _) -> [];
make_records3([R | T], H) ->
    case parse_row(R) of
        {data, D} when is_list(H) -> [#posting{header = H, data = D} | make_records3(T, H)];
        empty -> make_records3(T, H);
        {header, H2} -> make_records3(T, H2)
    end.
... and many many other variants.
I needed to collapse all Data lines beneath the header - so for the moment here is what I have:
sanitize(S) -> trim:trim(S).
make_records(L) when is_list(L) -> make_records(L, undefined, []).
make_records([], _, R) -> lists:reverse(R);
make_records([[32|_]=D|T], H, Acc) when is_tuple(H) ->
make_records(T, {element(1,H),[sanitize(D)|element(2,H)]},Acc);
make_records([[$\n|_]=D|T], H, Acc) when is_tuple(H) ->
make_records(T, H, Acc);
make_records([[F|_]=H|T], B, Acc) when F>=$0, F=<$9 ->
if is_tuple(B) ->
make_records(T, {sanitize(H),[]}, [#posting{header=element(1,B),
data=lists:reverse(element(2,B))}|Acc]);
true ->
make_records(T, {sanitize(H),[]}, Acc)
end.
What is the most time-efficient way to read a text file into a list of binary strings in Erlang? The obvious solution
-module(test).
-export([run/1]).
%% Open FileName in binary mode with the given Mode (e.g. 'read');
%% crashes on error via the {ok, _} assertion.
open_file(FileName, Mode) ->
    {ok, Device} = file:open(FileName, [Mode, binary]),
    Device.

%% Close a device, asserting success.
close_file(Device) ->
    ok = file:close(Device).

%% Accumulate all lines from Device (most recent first) and return
%% them in file order.  Fix: the second argument of io:get_line/2 is
%% a *prompt*, not an accumulator — the original passed the
%% accumulated lines as the prompt.
read_lines(Device, Acc) ->
    case io:get_line(Device, "") of
        eof ->
            lists:reverse(Acc);
        Line ->
            read_lines(Device, [Line | Acc])
    end.
%% Read every line of InputFileName and report how many were read.
%% The device is now closed even if reading fails (the original
%% leaked the file descriptor when read_lines/2 crashed).
run(InputFileName) ->
    Device = open_file(InputFileName, read),
    try
        Data = read_lines(Device, []),
        io:format("Read ~p lines~n", [length(Data)])
    after
        close_file(Device)
    end.
becomes too slow when the file contains more than 100000 lines.
{ok, Bin} = file:read_file(Filename).
or if you need the contents line by line
%% Body-recursive line reader: returns all lines of File in order,
%% stopping at eof.
read(File) ->
    case file:read_line(File) of
        eof -> [];
        {ok, Line} -> [Line | read(File)]
    end.
Read the entire file into a binary, convert it to a list, and rip out the lines.
This is far more efficient than any other method. If you don't believe me, time
it.
%% Read the whole file as one binary, then split it into lines (each
%% line keeps its trailing newline; a final line without a newline is
%% returned as-is).  Fixes vs. the sketch: 'bin' was a lowercase atom
%% where the bound variable Bin was meant (guaranteed badarg), and
%% reverse/1 must be qualified as lists:reverse/1 since nothing is
%% imported here.
file2lines(File) ->
    {ok, Bin} = file:read_file(File),
    string2lines(binary_to_list(Bin), []).

string2lines("\n" ++ Str, Acc) -> [lists:reverse([$\n | Acc]) | string2lines(Str, [])];
string2lines([H | T], Acc) -> string2lines(T, [H | Acc]);
string2lines([], Acc) -> [lists:reverse(Acc)].