ejabberd online status when user loses connection - erlang

I have ejabberd setup to be the xmpp server between mobile apps, ie. custom iPhone and Android app.
But I've seemingly run into a limitation of the way ejabberd handles online statuses.
Scenario:
User A is messaging User B via their mobiles.
User B loses all connectivity, so client can't disconnect from server.
ejabberd still lists User B as online.
Since ejabberd assumes User B is still online, any message from User A gets passed on to the dead connection.
So user B won't get the message, nor does it get saved as an offline message, as ejabberd assumes the user is online.
Message lost.
Until ejabberd realises that the connection is stale, it treats it as an online user.
And throw in data connection changes (wifi to 3G to 4G to...) and you'll find this happening quite a lot.
mod_ping:
I tried to implement mod_ping on a 10 second interval.
https://www.process-one.net/docs/ejabberd/guide_en.html#modping
But as the documentation states, the ping will wait 32 seconds for a response before disconnecting the user.
This means there will be a 42 second window where the user can lose their messages.
Ideal Solution:
Even if the ping wait time could be reduced, it still would not be a perfect solution.
Is there a way that ejabberd can wait for a 200 response from the client before discarding the message? If no response then save it offline.
Is it possible to write a hook to solve this problem?
Or is there a simple setting I've missed somewhere?
FYI: I am not using BOSH.

Here is the mod I wrote that fixes my problem.
To make it work you'll need receipts to be activated client side and the client should be able to handle duplicate messages.
Firstly I created a table called confirm_delivery. I save every 'chat' message to that table. I set a 10 second timer, if I receive a confirmation back, I delete the table entry.
If I don't get a confirmation back, I save the message manually to the offline_msg table and try and resend it again (this might be over the top, but for you to decide) and then delete it from our confirm_delivery table
I've chopped out all the code I perceive as unnecessary, so I hope this will still compile.
Hope this is of help to other ejabberd devs out there!
https://github.com/johanvorster/ejabberd_confirm_delivery.git
%% name of module must match file name
-module(mod_confirm_delivery).
-author("Johan Vorster").
%% Every ejabberd module implements the gen_mod behavior
%% The gen_mod behavior requires two functions: start/2 and stop/1
-behaviour(gen_mod).
%% public methods for this module
-export([start/2, stop/1, send_packet/3, receive_packet/4, get_session/5, set_offline_message/5]).
%% included for writing to ejabberd log file
-include("ejabberd.hrl").
-record(session, {sid, usr, us, priority, info}).
-record(offline_msg, {us, timestamp, expire, from, to, packet}).
-record(confirm_delivery, {messageid, timerref}).
%% gen_mod callback: called by ejabberd when the module is loaded for Host.
%% Creates the confirm_delivery tracking table (one row per outstanding
%% chat message awaiting a XEP-0184 receipt) and registers the two packet
%% hooks used to monitor delivery.
%% Fix: the host argument IS used, so it must not carry a '_' prefix
%% (underscore-prefixed variables signal "intentionally unused").
start(Host, _Opt) ->
    ?INFO_MSG("mod_confirm_delivery loading", []),
    %% RAM-only table by default; cleared on every start so stale timer
    %% references from a previous run are dropped.
    mnesia:create_table(confirm_delivery,
                        [{attributes, record_info(fields, confirm_delivery)}]),
    mnesia:clear_table(confirm_delivery),
    ?INFO_MSG("created timer ref table", []),
    ?INFO_MSG("start user_send_packet hook", []),
    ejabberd_hooks:add(user_send_packet, Host, ?MODULE, send_packet, 50),
    ?INFO_MSG("start user_receive_packet hook", []),
    ejabberd_hooks:add(user_receive_packet, Host, ?MODULE, receive_packet, 50).
%% gen_mod callback: called by ejabberd when the module is unloaded for
%% Host; removes the hooks registered in start/2. The mnesia table is
%% deliberately left in place.
%% Fix: the host argument IS used, so it must not carry a '_' prefix.
stop(Host) ->
    ?INFO_MSG("stopping mod_confirm_delivery", []),
    ejabberd_hooks:delete(user_send_packet, Host, ?MODULE, send_packet, 50),
    ejabberd_hooks:delete(user_receive_packet, Host, ?MODULE, receive_packet, 50).
%% user_send_packet hook: runs for every stanza a local user sends.
%% For "chat" messages with a non-empty body addressed to a user that has
%% at least one active session, start a 10 s timer (get_session/5 fires if
%% no receipt arrives in time) and remember the timer keyed by message id.
send_packet(From, To, Packet) ->
    ?INFO_MSG("send_packet FromJID ~p ToJID ~p Packet ~p~n",[From, To, Packet]),
    Type = xml:get_tag_attr_s("type", Packet),
    ?INFO_MSG("Message Type ~p~n",[Type]),
    Body = xml:get_path_s(Packet, [{elem, "body"}, cdata]),
    ?INFO_MSG("Message Body ~p~n",[Body]),
    MessageId = xml:get_tag_attr_s("id", Packet),
    ?INFO_MSG("send_packet MessageId ~p~n",[MessageId]),
    %% To is a #jid{}; elements 2 and 3 are the raw user and server fields.
    %% NOTE(review): the session table is indexed on the *normalized*
    %% {LUser, LServer} pair — confirm elements 2/3 (user/server) rather
    %% than 5/6 (luser/lserver) are the right ones for this ejabberd version.
    LUser = element(2, To),
    ?INFO_MSG("send_packet LUser ~p~n",[LUser]),
    LServer = element(3, To),
    ?INFO_MSG("send_packet LServer ~p~n",[LServer]),
    Sessions = mnesia:dirty_index_read(session, {LUser, LServer}, #session.us),
    ?INFO_MSG("Session: ~p~n",[Sessions]),
    case Type =:= "chat" andalso Body =/= [] andalso Sessions =/= [] of
        true ->
            %% Fix: use ?MODULE instead of the hard-coded module name so the
            %% timer callback stays correct if the module is ever renamed.
            {ok, Ref} = timer:apply_after(10000, ?MODULE, get_session,
                                          [LUser, LServer, From, To, Packet]),
            ?INFO_MSG("Saving To ~p Ref ~p~n",[MessageId, Ref]),
            F = fun() ->
                        mnesia:write(#confirm_delivery{messageid = MessageId,
                                                       timerref = Ref})
                end,
            mnesia:transaction(F);
        _ ->
            ok
    end.
%% user_receive_packet hook: runs for every stanza delivered to a local
%% user. When the stanza carries a XEP-0184 <received/> receipt, look up
%% the pending delivery timer for that message id, cancel it, and delete
%% the tracking row — the message is confirmed delivered.
%% Fix: the original exported MessageId/Record from 'if' branches (a
%% fragile "unsafe variable" pattern); restructured into explicit case
%% expressions with identical behavior. Also the first argument was read
%% despite its '_' prefix, so it is renamed.
receive_packet(JID, From, To, Packet) ->
    ?INFO_MSG("receive_packet JID: ~p From: ~p To: ~p Packet: ~p~n",[JID, From, To, Packet]),
    %% xml:get_subtag/2 returns the subelement or 'false' when absent.
    Received = xml:get_subtag(Packet, "received"),
    ?INFO_MSG("receive_packet Received Tag ~p~n",[Received]),
    %% MessageId is [] unless a <received/> subtag is present.
    MessageId =
        case Received =/= false andalso Received =/= [] of
            true ->
                Id = xml:get_tag_attr_s("id", Received),
                ?INFO_MSG("receive_packet MessageId ~p~n",[Id]),
                Id;
            false ->
                []
        end,
    %% Only touch mnesia when an id was actually extracted.
    Record =
        case MessageId of
            [] ->
                [];
            _ ->
                Rows = mnesia:dirty_read(confirm_delivery, MessageId),
                ?INFO_MSG("receive_packet Record: ~p~n",[Rows]),
                Rows
        end,
    case Record of
        [] ->
            ok;
        _ ->
            %% Exactly one row per message id (set table); crash loudly on
            %% anything else, exactly as the original [R] = Record did.
            [R] = Record,
            ?INFO_MSG("receive_packet Record Elements ~p~n",[R]),
            %% Record accessor instead of element(3, R): same field, clearer.
            Ref = R#confirm_delivery.timerref,
            ?INFO_MSG("receive_packet Cancel Timer ~p~n",[Ref]),
            timer:cancel(Ref),
            mnesia:dirty_delete(confirm_delivery, MessageId),
            ?INFO_MSG("confirm_delivery clean up",[])
    end.
%% Timer callback, invoked by timer:apply_after/4 from send_packet/3 when
%% no delivery receipt arrived within 10 seconds.
%% Re-routes the original packet once more, also persists it as an offline
%% message, then removes the tracking row for this message id.
%% NOTE(review): the recipient can therefore receive the message twice
%% (the resend plus the offline copy on next login); the author states
%% clients must be able to de-duplicate by message id.
get_session(User, Server, From, To, Packet) ->
?INFO_MSG("get_session User: ~p Server: ~p From: ~p To ~p Packet ~p~n",[User, Server, From, To, Packet]),
%% Resend through the router (may still land on the dead session).
ejabberd_router:route(From, To, Packet),
?INFO_MSG("Resend message",[]),
%% Persist a copy so it survives if the user is truly offline.
set_offline_message(User, Server, From, To, Packet),
?INFO_MSG("Set offline message",[]),
MessageId = xml:get_tag_attr_s("id", Packet),
?INFO_MSG("get_session MessageId ~p~n",[MessageId]),
case MessageId =/= [] of
true ->
%% The timer has already fired, so only the table row needs cleanup.
mnesia:dirty_delete(confirm_delivery, MessageId),
?INFO_MSG("confirm_delivery clean up",[]);
_ ->
ok
end.
%% Manually write Packet into ejabberd's offline_msg mnesia table so the
%% recipient receives it at next login.
%% NOTE(review): ejabberd's own mod_offline uses the atom 'never' for the
%% expire field, not the string "never" — confirm this matches the target
%% ejabberd version, otherwise expiry handling of these rows may differ
%% from normal offline messages.
%% NOTE(review): now/0 is deprecated on modern OTP; erlang:timestamp/0
%% returns the same {MegaSecs, Secs, MicroSecs} shape on OTP >= 18.
set_offline_message(User, Server, From, To, Packet) ->
?INFO_MSG("set_offline_message User: ~p Server: ~p From: ~p To ~p Packet ~p~n",[User, Server, From, To, Packet]),
F = fun() ->
mnesia:write(#offline_msg{us = {User, Server}, timestamp = now(), expire = "never", from = From, to = To, packet = Packet})
end,
mnesia:transaction(F).

This is well known limitation of TCP connections. You need to introduce some acknowledgment functionality.
One option is XEP-0184 (Message Delivery Receipts): a message may carry a receipt request, and when it is delivered, a receipt goes back to the sender.
Another option is xep-0198. This is stream management which acknowledges stanzas.
You can also implement it entirely in application layer and send messages from recipient to sender.
Act accordingly when acknowledgment is not delivered.
Mind that Sender -> Server connection also may be severed in that way.
I am not aware of implementation of those xeps and features in ejabberd. I implemented them on my own depending on project requirements.

ejabberd supports stream management as default in latest version. It is implemented in most mobile libraries like Smack for Android and XMPPFramework for iOS.
This is the state of the art in XMPP specification at the moment.

Implementing XEP-198 on ejabberd is quite involved.
Erlang Solutions (I work for them) has an XEP-184 module for ejabberd, with enhanced functionality, that solves this problem. It does the buffering and validation on the server side. As long as client sends messages carrying receipt request and when it is delivered the receipt goes back to sender.
The module validates receipts to see if message has been received. If it hasn't within timeout, it gets saved as an offline message.

I think a better way is: if a message has not been received, mark the user offline, store the message in the offline-message table, and use a push service configured for offline messages.
Then a push will be sent, and any further messages will be stored as offline messages. For detecting on the server that a message has not been received, you can use this: https://github.com/Mingism/ejabberd-stanza-ack.
I think Facebook has the same way when a message doesn't deliver it makes user offline until he become online again

Ejabberd supports stream management as default in latest version.
After set stream manager config in ejabberd_c2s, You should set some config in your client.
Please see this post for this config in client.
https://community.igniterealtime.org/thread/55715

Related

gen_server:reply/2: format of message sent to client

When I call gen_server:reply/2:
gen_server:reply(From, Msg),
the client, From, receives a message with the format:
{Ref, Msg}
I can't find any documentation for the message format sent by gen_server:reply/2, and I'm wondering how I can pattern match the Ref in the message. Currently, I use a don't care variable for the Ref:
receive
{_Ref, Msg} -> Msg;
Other -> Other
end
which means that a process other than the gen_server could potentially send my client a message that would match the {_Ref, Msg} clause.
In the call gen_server:reply(From, Msg), From is not simply the client: it is in fact a tuple containing two values, the process id of the caller and a unique reference. We can see this in the implementation of gen_server:reply/2:
%% -----------------------------------------------------------------
%% Send a reply to the client.
%% -----------------------------------------------------------------
reply({To, Tag}, Reply) ->
catch To ! {Tag, Reply}.
The idea is that Tag is a unique value provided by the caller, so that the caller can distinguish the result from this call from any other incoming message:
Ref = make_ref(),
MyServer ! {'$gen_call', {self(), Ref}, foo},
receive
{Ref, Reply} -> io:format("Result of foo call: ~p~n", [Reply])
end
In the code above, the receive will block until it gets a response to this very call.
(gen_server:call/2 does something like the above, and additionally monitors the server in case it crashes, and checks for timeouts.)
The reason this is undocumented is that it is considered an internal implementation detail subject to change, and users are advised to rely on gen_server:call and gen_server:reply instead of generating and matching the messages themselves.
Most of the time you wouldn't need to use gen_server:reply/2 at all: the server process receives a call and handles it synchronously, returning a reply tuple:
handle_call(foo, _From, State) ->
%% ignoring 'From' here, because we're replying immediately
{reply, foo_result, State}.
But sometimes you'd want the server process to delay replying to the call, for example waiting for network input:
handle_call(foo, From, State) ->
send_request(foo),
NewState = State#state{pending_request = From},
{noreply, NewState}.
handle_info({received_response, Response}, State = #state{pending_request = From}) ->
gen_server:reply(From, Response),
NewState = State#state{pending_request = undefined},
{noreply, NewState}.
In the example above, we save the From value in the server state, and when the response comes in as an Erlang message, we forward it to the caller, which will block until it gets the response. (A more realistic example would handle multiple requests concurrently and match incoming responses to outstanding requests somehow.)
It is a gen.erl feature used by gen_* behaviors. You can see gen_event's call, gen_server's call and gen_statem's call.
So how does it work?
The Idea is simple, when you call gen:call/4 or gen:call(Process, Label, Request, Timeout), It monitors Process. So erlang:monitor/2 yields a reference. It uses this reference and sends message to Process in form of {Label, {self(), Ref}, Request}. After that it waits for {Ref, Reply} for specified Timeout and after receiving reply it demonitors Process. Also if Process crashes during sending Reply or even if Process was a dead pid before call, it receives {'DOWN', Ref, _, _, Reason}.
For example, gen_server:call/2,3 calls gen:call(Process, '$gen_call', Req, Timeout). When the server process (which is a gen_server) receives it, it assumes it is a call request, so it calls your handle_call function, etc.

Erlang : Use of `Ref` in query and response

I am reading LearnYouSomeErlang and found the code below :
My question is that why do we need Ref in event function.
I think the reason given is that Ref is like request-id.
if i send multiple requests to dont_give_crap and when i get back replies, Ref in response tells me which id it is.
However, since event is blocking and i am sending only one request, in which situation, will i be able to send multiple requests from same process to dont_give_crap process ? What is the purpose of Ref ?
%% cat_fsm: a minimal one-state "FSM" (from Learn You Some Erlang)
%% demonstrating synchronous request/reply keyed by a unique reference.
-module(cat_fsm).
-export([start/0, event/2]).

%% Spawn the cat process in its only state and return its pid.
start() ->
    spawn(fun dont_give_crap/0).

%% Send Event to the cat and block until its tagged reply arrives.
%% The fresh reference ties the reply to this specific request, so a
%% stray message from another process cannot be mistaken for the answer.
event(Pid, Event) ->
    Ref = make_ref(), % won't care for monitors here
    Pid ! {self(), Ref, Event},
    receive
        {Ref, Reply} -> {ok, Reply}
    after 5000 ->
        {error, timeout}
    end.

%% The cat's sole state: answer any tagged request with 'meh', ignore
%% everything else, then loop forever.
dont_give_crap() ->
    receive
        {From, Ref, _Event} -> From ! {Ref, meh};
        _Other -> ok
    end,
    io:format("Switching to 'dont_give_crap' state~n"),
    dont_give_crap().
My question is that why do we need Ref
It's like a verification code. Any other process can send your process a message if it has your process's Pid.
I think the reason given is that Ref is like request-id.
Ref is more like a reply-id. It's an id that a process expects to receive in a reply. If you write:
receive
Msg -> %%do something
end
then any message sent to your process's mailbox will match that pattern, and you won't know where it came from. And if you write:
receive
{Pid, Msg} -> %%do something
end
you still can't be sure that the message came from process Pid. Another one of your processes might have multiple Pids it is replying to and accidentally used the wrong Pid for the message.
But if you send a Ref with your request to process Pid, then process Pid can reply with a message that contains the Ref it received and its Pid, then you can extract the message containing that Ref and the sender's Pid from your mailbox and be sure it came from process Pid (well, actually process Pid could send the Ref to some other process, so you still can't be absolutely certain).
If you had an erlang application with 1,000's of processes sending 100's of messages to each other, it might be more error prone if you only verified where replies came from by Pid.
Edit: I was just reading about gen_udp and sockets, and it turns out that some servers can duplicate a reply. Suppose that happens, and a client extracts the first reply from the mailbox like this:
receive
{Pid, Msg1} -> Msg1
end
Then the client sends a second request and waits for the reply:
receive
{Pid, Msg2} -> Msg2
end
Well, that pattern will extract the duplicate reply to the first request--not the reply to the second request. But if a unique Ref and been sent with both the first and second requests:
Pid = ...,
Ref1 = ...,
Pid ! {self(), Ref1, Msg1},
receive
{Pid, Ref1, Reply1} -> Reply1
end,
Ref2 = ...,
Pid ! {self(), Ref2, Msg2},
receive
{Pid, Ref2, Reply2} -> Reply2
end.
then the second receive would not match a duplicate reply to the first request. That is a concrete example of a good use of a Ref.
Use of Ref in query and response
Each process has their own mailbox like post box at home. Mailbox is the entry point for sending any request to any process.
As per your code you want a response back from a particular process. So
you have to specify expected process pid in the place of message extraction from mailbox.
Will i be able to send multiple requests from same process to dont_give_crap process ? What is the purpose of Ref ?
That is not possible here: if process A sends a blocking request to process B (A -> B), then A is blocked waiting for the reply and will not process any other message from any other process in the meantime.
Each message will differ by the pattern match of message structure

Data streaming using streamcontent_from_pid in Yaws/Erlang

I desire to stream data with yaws to my comet application, I have read and worked around to understand it but the example from yaws seems to be a little complicated for me (I am new to Erlang). I just cannot get my head around...
here is the example from yaws (I modified a little bit):
%% Yaws appmod entry point (the question's example): hands the client
%% socket to a freshly spawned process which streams data back via
%% streamcontent_from_pid.
%% NOTE(review): S is passed as a shell command to open_port({spawn, S});
%% "Hello World" is not an executable, so as posted this will not stream
%% meaningful data — it only illustrates the plumbing.
%% NOTE(review): content_length is a random 0/1 while the streamed data
%% has a different length; the two should agree, or the header should be
%% omitted so Yaws can stream without a fixed length.
out(A) ->
%% Create a random number
{_A1, A2, A3} = now(),
random:seed(erlang:phash(node(), 1),
erlang:phash(A2, A3),
A3),
Sz = random:uniform(1),
Pid = spawn(fun() ->
%% Read random junk
S="Hello World",
P = open_port({spawn, S}, [binary,stream, eof]),
rec_loop(A#arg.clisock, P)
end),
[{header, {content_length, Sz}},
{streamcontent_from_pid, "text/html; charset=utf-8", Pid}].
%% Handshake phase: Yaws sends {ok, YawsPid} to start streaming or
%% {discard, YawsPid} to abort; either way we end by closing the port.
rec_loop(Sock, P) ->
receive
{discard, YawsPid} ->
yaws_api:stream_process_end(Sock, YawsPid);
{ok, YawsPid} ->
rec_loop(Sock, YawsPid, P)
end,
%% Reached after either branch returns: release the port and exit cleanly.
port_close(P),
exit(normal).
%% Streaming phase: forward each chunk arriving from port P to the HTTP
%% client until the port signals eof, then tell Yaws the stream is done.
rec_loop(Sock, YawsPid, P) ->
receive
{P, {data, BinData}} ->
yaws_api:stream_process_deliver(Sock, BinData),
rec_loop(Sock, YawsPid, P);
{P, eof} ->
yaws_api:stream_process_end(Sock, YawsPid)
end.
What I need is to transform the above script to which can be combined with the following.
mysql:start_link(p1, "127.0.0.1", "root", "azzkikr", "mydb"),
{data, Results} = mysql:fetch(p1, "SELECT*FROM messages WHERE id > " ++ LASTID),
{mysql_result, FieldNames, FieldValues, NoneA, NoneB} = Results,
parse_data(FieldValues, [], [], [], [], [])
Where parse_data(FieldValues, [], [], [], [], []) returns a JSON string of the entry..
Combined this script should constantly check for a new entry into database and if there is, it should fetch as comet should.
Thank you, May you all go to paradise!
As this answer explains, sometimes you need to have a process running that's independent of any incoming HTTP requests. For your case, you can use a form of publish/subscribe:
Publisher: when your Erlang node starts up, start some sort of database client process, or a pool of such processes, executing your query and running independently of Yaws.
Subscriber: when Yaws receives an HTTP request and dispatches it to your code, your code subscribes to the publisher. When the publisher sends data to the subscriber, the subscriber streams them back to the HTTP client.
Detailing a full solution here is impractical, but the general steps are:
When your database client processes start, they register themselves into a pg2 group or something similar. Use something like poolboy instead of rolling your own process pools, as they're notoriously tricky to get right. Each database client can be an instance of a gen_server running a query, receiving database results, and also handling subscription request calls.
When your Yaws code receives a request, it looks up a database client publisher process and subscribes to it. Subscriptions require calling a function in the database client module, which in turn uses gen_server:call/2,3 to communicate with the actual gen_server publisher process. The subscriber uses Yaws streaming capabilities (or SSE or WebSocket) to complete the connection with the HTTP client and sends it any required response headers.
The publisher stores the process ID of the subscriber, and also establishes a monitor on the subscriber so it can clean up the subscription should the subscriber die or exit unexpectedly.
The publisher uses the monitor's reference as a unique ID in its messages it sends to that subscriber, so the subscription function returns that reference to the subscriber. The subscriber uses the reference to match incoming messages from the publisher.
When the publisher gets new query results from the database, it sends the data to each of its subscribers. This can be done with normal Erlang messages.
The subscriber uses Yaws streaming functions (or SSE or WebSocket features) to send query results to the HTTP client.
When an HTTP client disconnects, the subscriber calls another publisher function to unsubscribe.

Erlang: inter-application communication

To continue on with my journey to Erlands I'm developing simple IM system using OTP.
There are two OTP applications: a server (one instance) and a client (multiple instances). A set-up is shown below:
╭── node1#host ──╮
│ Server │
│ └gen_server │
╰────────────────╯
╭── node2#host ──╮
│ Client │
│ └gen_server │
╰────────────────╯
╭── node3#host ──╮
│ Client │
│ └gen_server │
╰────────────────╯
...
Client functions
Using Erlang shell, we can issue next commands to the client application:
Connect to the server and receive a random name from it (I'm fond of names like turbo-octopus, miniature-octocat etc. :)
Get a list of other connected clients.
Send a message to the client with specified name.
Send a message to all clients (broadcast).
Also client should be able to print message in stdout upon receiving.
Implementation details
All messages go through server.
Both server and client applications contain gen_servers (chat_server.erl and chat_client.erl respectively) responsible for handling messages. Server's chat_server process registered as global and visible on all nodes:
%% chat_server.erl
start_link() ->
gen_server:start_link({global, ?SERVER}, ?MODULE, [], []).
When a client connects, it sends pid of its gen_server process. Doing this, we can store clients pids in server's state to distinguish them and sending/broadcasting messages.
%% chat_client.erl
connect() ->
Res = gen_server:call({global, ?REMOTE_SERVER}, {connect, client_pid() ...}),
...
%% pid of the client's gen_server
client_pid() -> whereis(?CLIENT_SERVER).
Server connect handle:
%% chat_server.erl
handle_call({connect, Pid}, _From, State) ->
%% doing stuff like generating unique name,
%% adding client to list, etc.
{reply, {connected, Name}, UpdatedState}.
Messaging (pun intended)
Well, it's pretty straightforward. The server handles cast from a client, seeks recipient's pid by given name and cast message to it/broadcasts to everyone. And this is it.
A Question will be asked
While developing this system, I wondered if a chosen approach is appropriate. I mean,
Passing around client's gen_server pid seems more or less acceptable at least because it allows uniquely identify clients and use all gen_server firepower on both ends. Is this the way you do it?
I have read here and there that explicit interface (calling exported functions) is preferable to direct messaging (thing I do in my client with gen_server:calls). Is there any way to fix this (rpc for example), or it's okay?
Given the same set-up (a node with a server application and N nodes with clients), will you use the same approach with gen_servers, or there is a better approach I'm unaware of?
Personally I think your architecture is slightly off.
If you want your client to accept incoming messages (e.g. when another client is sending a message to you or a doing a broadcast) then there currently doesn't seem to be a process where the server can send the messages to. gen_server is typically not the vehicle for that; it's mainly for server processes.
I think the idea should be to start a new process for every client. The process will become that specific client's main loop. If you (the user) wants to do something, you send a message to that specific process. That can be hidden behind function calls. Then the client's main loop will interact with the server.
The client's main loop - which is a separate process, is always ready to receive messages, so the server can send messages to your client if someone is sending to you.
BTW: I hope that your defines ?SERVER and ?REMOTE_SERVER are identical because, if I understand correctly, they both refer to the globally registered chat server. Better stick with one unique name.
Another issue is that you do not typically expose the gen_server:call() methods. The clients only call methods in the chat_server module without knowing what the name of the server is or whereever it lives (that's the beauty of Erlang!).
In chat_server.erl you put code like this; basically the client API. You will notice that in chat_client.erl there will only be calls to methods in the chat_server module. Very clean and transparent!
%% Client-facing API of the chat server: callers never see the global
%% name or the gen_server protocol, only these functions.
%% Fixes: L394 had a typo ('gen_srver') and a mismatched brace
%% ('{global, ?SERVER)'), both of which made the original fail to compile.

%% let a new client connect, all we need is its Pid
new_client(Pid) ->
    gen_server:call({global, ?SERVER}, {connect, Pid}).

%% relay a message From one client To another via the server
send_msg(From, To) ->
    gen_server:call({global, ?SERVER}, {sendmsg, From, To}).

%% de-register a client when it logs out
logout_client(Pid) ->
    gen_server:call({global, ?SERVER}, {exit_client, Pid}).
The client code below (deliberately) does not automatically register the client's Pid, unless you restrict your system to allow exactly only one client per node. You cannot register more than one Pid under the same name.
The code below does not register the new Pid as a name but it can be made to do so trivially, if that is what you desire or need.
Typically the client's code looks like:
%% start a new client, we spawn a new process for this
%% particular client and return their Pid, to be used
%% when you want your client to do something
%% Fixes: the Server argument was never used (compiler warning), so it is
%% now '_Server'; the log format string typo "sais" is corrected to "says".
%% NOTE(review): spawn/3 with an MFA requires start_client/0 to be
%% exported from this module.
connect(_Server) ->
    spawn(?MODULE, start_client, []).

%% client startup code
start_client() ->
    %% Initialize client state, if you wish
    State = 42,
    %% Now connect to chat server
    chat_server:new_client(self()),
    %% And fall into our own main loop
    client_loop(State).

%% This is the client's main loop: a plain receive loop that reacts to
%% server-pushed messages and to local user commands.
client_loop(State) ->
    %% Wait for stuff to happen ...
    receive
        %% chat server sends message to us
        {message, Msg, From} ->
            io:format("~p says ~p~n", [From, Msg]),
            client_loop(State);
        %% message sending is delegated to the server - see your own protocol
        {send, Msg, To} ->
            chat_server:send_msg(Msg, To),
            client_loop(State);
        %% terminate?
        done ->
            %% de-register with server
            chat_server:logout_client(self())
    end.
Now all that's needed is some utility functions to interact with your client process like below. Note that if you go to "each Erlang node is a single client" by registering the client's Pid locally, you can get rid of passing the Pid explicitly. But the mechanics remain the same.
%% Ask the client process Pid to forward Msg to the recipient To.
send_message(Pid, Msg, To) ->
    Request = {send, Msg, To},
    Pid ! Request.

%% Tell the client process to log out and leave its main loop.
logout(Pid) ->
    Pid ! done.

%% If you force your client's Pid to be registered to e.g. 'registered_name'
%% it would look like
send_message(Msg, To) ->
    registered_name ! {send, Msg, To}.
I agree with haavee that your architecture is not what I'd expect, but that's because I'm imagining something more low-level-TCP.
Regarding your questions:
Passing around client's gen_server pid seems more or less acceptable
at least because it allows uniquely identify clients and use all
gen_server firepower on both ends. Is this the way you do it?
Sure, I see nothing wrong with that part of the code. Your server keeps a mapping between PID and client name and it's like calling register/2 but only the server gets the mapping and you control how it works.
I have read here and there that explicit interface (calling exported
functions) is preferable to direct messaging (thing I do in my client
with gen_server:calls). Is there any way to fix this, or it's okay?
If you compiled your client and server applications together (one code-base, two entry-points) then you could do this. Instead of, on the client-side doing
connect() ->
Res = gen_server:call({global, ?REMOTE_SERVER}, {connect, client_pid() ...}),
you'd have
-module(client).
connect() ->
server:client_connect(client_pid()).
and
-module(server).
client_connect(ClientPID) ->
Res = gen_server:call({global, ?REMOTE_SERVER}, {connect, ClientPID ...}).
But if you want to use net_kernel to connect nodes and you want to compile the source code independently, then your way is how you do it.
Given the same set-up (a node with a server application and N nodes
with clients), will you use the same approach with gen_servers, or
there is a better approach I'm unaware of?
What you're doing with net_kernel is building a distributed system. If you expect a few clients, that's fine. If you expect a ton of clients, then you have to remember that distributed Erlang defaults to a totally-connected mesh. So all your clients are actually connected to each other, as well as the server.
When I look at your description, I imagine a chat-server, and for this I would use gen_tcp for networking instead of net_kernel.
Advantages of net_kernel:
It's very high-level. You don't need to think much about connection drops, and messages are very pure-Erlang.
It's easier to debug. You can use the rpc module from a shell to run anything on any connected node, which is cool.
Advantages of gen_tcp:
Server and clients are less connected. You could swap out a client or the server for a different version with the same network API (including swapping for something non-Erlang) and nobody else would know or care.
Clients aren't interconnected (you can also do this with hidden nodes)
You can use popular port numbers to get past dumb firewalls
I'd put your "client" and "server" modules both on the server machine. You listen for TCP connections and spawn off a "client" for each connection. The "client" module's job is to translate between the remote client talking over the network and the "server" module, talking over Erlang messages.

Simple chat system over websockets with reconnection feature

I have seen many examples of chat room systems over websocket implemented with erlang and cowboy.
Most of the examples I have seen use gproc. In practice each websocket handler registers itself with gproc and then broadcasts/receives messages from it.
Since a user could close by accident the webpage I am thinking about connecting to the websocket handler a gen_fsm which actually broadcasts/receives all the messages from gproc. In this way the gen_fsm could switch from a "connected" state to a "disconnected" state whenever the user exits and still buffer all the messages. After a while if the user is not back online the gen_fsm will terminate.
Is this a good solution? How can I make the new websocket handler to recover the gen_fsm process? Should I register the gen_fsm using the user name or is there any better solution?
What i do is the folowing :
When a user connects to the site, I spawn a gen_server representing that user. The gen_server then registers itself in gproc as {n,l,{user, UserName}}. (It can also register properties like {p,l,{chat, ChannelID}} to listen to chat channels — see gproc pub/sub.)
Now the user's websocket connection starts the cowboy handler (I use Bullet). The handler asks gproc for the pid() of the user's gen_server and registers itself as a receiver of messages. From then on, when the user's gen_server receives messages, it redirects them to the websocket handler.
When the websocket connection ends, the handler unregisters from the user's gen_server, so the gen_server will keep messages until the next connection or the next timeout. At the timeout, you can simply terminate the server (messages will be lost, but that is acceptable).
See : (not tested)
-module(user_chat).
-record(state, {mailbox,receiver=undefined}).
-export([start_link/1,set_receiver/1,unset_receiver/1]).
%% API
%% Start the mailbox gen_server for UserID (registers itself in init/1).
start_link(UserID) ->
    gen_server:start_link(?MODULE, [UserID], []).

%% Attach the calling process as the receiver of buffered chat messages.
set_receiver(UserID) ->
    set_receiver(UserID, self()).

%% Detach: messages are buffered until a receiver reattaches.
unset_receiver(UserID) ->
    %% Just set the receiver to undefined
    set_receiver(UserID, undefined).

%% Internal: resolve the user's mailbox process via gproc and call it.
%% Fix: init/1 registers the process as {n, l, {user, UserID}}, so the
%% lookup key must be the same term — the original looked up
%% {n, l, UserID} and would always get 'undefined'.
set_receiver(UserID, ReceiverPid) ->
    UserPid = gproc:where({n, l, {user, UserID}}),
    gen_server:call(UserPid, {set_receiver, ReceiverPid}).
%% Gen server internals
%% Gen server internals

%% Register this mailbox under the user's id so clients can find it,
%% and start with an empty message buffer.
init([UserID]) ->
    gproc:reg({n, l, {user, UserID}}),
    {ok, #state{mailbox = []}}.

%% A receiver (pid or 'undefined') is being attached.
%% Fix: flush the buffered mailbox against the NEW receiver — the
%% original called check_send/2 with the old state, so messages buffered
%% while disconnected were not delivered until the next message arrived.
handle_call({set_receiver, ReceiverPid}, _From, #state{mailbox = MB} = State) ->
    NewState = State#state{receiver = ReceiverPid},
    NewMB = check_send(MB, NewState),
    {reply, ok, NewState#state{mailbox = NewMB}}.

%% A chat message arrives: prepend it to the buffer (newest first) and
%% deliver immediately if a receiver is attached.
handle_info({chat_msg, Message}, #state{mailbox = MB} = State) ->
    NewMB = check_send([Message | MB], State),
    {noreply, State#state{mailbox = NewMB}}.
%% Deliver the buffered mailbox to the attached receiver, if any.
%% Returns the new mailbox contents to store in the state.
%% Fix: the second clause was missing its terminating ';' (a syntax
%% error in the original as posted).

%% Mailbox empty: nothing to do.
check_send([], _) -> [];
%% Receiver undefined, keep messages buffered.
check_send(Mailbox, #state{receiver = undefined}) -> Mailbox;
%% Receiver is a pid: hand over the whole buffer in one message.
%% NOTE(review): messages are prepended on arrival, so the list is
%% newest-first; the receiver may want to reverse it for display.
check_send(Mailbox, #state{receiver = Receiver}) when is_pid(Receiver) ->
    %% Send all messages
    Receiver ! {chat_messages, Mailbox},
    %% Then return empty mailbox
    [].
With the solution you propose you may have many processes pending and you will have to write a "process cleaner" for all user that never come back. Anyway it will not support a shutdown of the chat server VM, all messages stored in living FSM will vanish if the node is down.
I think that a better way should be to store all messages in a database like mnesia, with sender, receiver, expiration date... and check for any stored message at connection, and have a message cleaner process to destroy all expired messages from time to time.

Resources