From 8786df05258e069192758d2524ef6ce98bac520b Mon Sep 17 00:00:00 2001 From: anlicheng <244108715@qq.com> Date: Wed, 21 May 2025 20:05:34 +0800 Subject: [PATCH] change efka_agent to gen_statem --- README.md | 2 + apps/efka/src/efka_agent2.erl | 399 ++++++++++++++++++++++++++++++++++ 2 files changed, 401 insertions(+) create mode 100644 apps/efka/src/efka_agent2.erl diff --git a/README.md b/README.md index 1dbf49f..29d3de6 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ efka An OTP application 1. 先解决数据的上行问题 +2. todo list + 要解决连接断开重新连接的问题 !!! Build ----- diff --git a/apps/efka/src/efka_agent2.erl b/apps/efka/src/efka_agent2.erl new file mode 100644 index 0000000..7363564 --- /dev/null +++ b/apps/efka/src/efka_agent2.erl @@ -0,0 +1,399 @@ +%%%------------------------------------------------------------------- +%%% @author anlicheng +%%% @copyright (C) 2025, +%%% @doc +%%% +%%% @end +%%% Created : 21. 5月 2025 18:38 +%%%------------------------------------------------------------------- +-module(efka_agent2). +-author("anlicheng"). +-include("message_pb.hrl"). +-include("efka.hrl"). +-include("efka_tables.hrl"). + +-behaviour(gen_statem). + +%% API +-export([start_link/0]). +-export([metric_data/3, event/3, ping/13, request_service_config/2, await_reply/2]). + +%% gen_statem callbacks +-export([init/1, handle_event/4, terminate/3, code_change/4, callback_mode/0]). + +-define(SERVER, ?MODULE). + +%% 标记当前agent的状态,只有在 activated 状态下才可以正常的发送数据 +-define(STATE_DENIED, denied). +-define(STATE_CONNECTING, connecting). +-define(STATE_AUTH, auth). +%% 不能推送消息到服务,但是可以接受服务器的部分指令 +-define(STATE_RESTRICTED, restricted). +%% 激活状态下 +-define(STATE_ACTIVATED, activated). + +-record(state, { + transport_pid :: undefined | pid(), + %% 映射关系 #{Ref => PacketId} + inflight = #{} +}). 
+
+%%%===================================================================
+%%% API
+%%%===================================================================
+
+%% @doc Hands one metric sample to the agent. The request is an asynchronous
+%% cast, so `ok' is returned immediately and says nothing about delivery.
+-spec metric_data(ServiceId :: binary(), DeviceUUID :: binary(), LineProtocolData :: binary()) -> ok.
+metric_data(ServiceId, DeviceUUID, LineProtocolData) when is_binary(ServiceId), is_binary(DeviceUUID), is_binary(LineProtocolData) ->
+    gen_statem:cast(?SERVER, {metric_data, ServiceId, DeviceUUID, LineProtocolData}).
+
+%% @doc Hands one service event to the agent (asynchronous cast).
+-spec event(ServiceId :: binary(), EventType :: integer(), Params :: binary()) -> ok.
+event(ServiceId, EventType, Params) when is_binary(ServiceId), is_integer(EventType), is_binary(Params) ->
+    gen_statem:cast(?SERVER, {event, ServiceId, EventType, Params}).
+
+%% @doc Hands one host heartbeat to the agent (asynchronous cast). The
+%% arguments are forwarded untouched; their types are not checked here.
+ping(AdCode, BootTime, Province, City, EfkaVersion, KernelArch, Ips, CpuCore, CpuLoad, CpuTemperature, Disk, Memory, Interfaces) ->
+    gen_statem:cast(?SERVER, {ping, AdCode, BootTime, Province, City, EfkaVersion, KernelArch, Ips, CpuCore, CpuLoad, CpuTemperature, Disk, Memory, Interfaces}).
+
+%% @doc Requests the configuration of a micro service (synchronous call).
+%% NOTE(review): the callbacks of this module have no clause that actually
+%% serves this request yet - confirm before relying on the `{ok, Ref}' result.
+-spec request_service_config(ReceiverPid :: pid(), ServiceId :: binary()) -> {ok, Ref :: reference()} | {error, Reason :: term()}.
+request_service_config(ReceiverPid, ServiceId) when is_binary(ServiceId) ->
+    gen_statem:call(?SERVER, {request_service_config, ReceiverPid, ServiceId}).
+
+%% @doc Waits in the calling process for the `{transport_reply, Ref, Bin}'
+%% message that answers `Ref'. `Timeout' is in milliseconds or `infinity',
+%% as the spec promises; `{error, timeout}' is returned when it elapses.
+-spec await_reply(Ref :: reference(), Timeout :: timeout()) -> {ok, Reply :: binary()} | {error, timeout}.
+await_reply(Ref, Timeout) when is_reference(Ref), is_integer(Timeout); is_reference(Ref), Timeout =:= infinity ->
+    receive
+        {transport_reply, Ref, ReplyBin} ->
+            {ok, ReplyBin}
+    after Timeout ->
+        {error, timeout}
+    end.
+
+%% @doc Starts the agent and registers it locally under the module name.
+%% Does not return until init/1 has returned.
+start_link() ->
+    gen_statem:start_link({local, ?SERVER}, ?MODULE, [], []).
+
+%%%===================================================================
+%%% gen_statem callbacks
+%%%===================================================================
+
+%% @private
+%% @doc Initialises the agent. Exits are trapped so that the death of the
+%% linked transport arrives as an 'EXIT' message, and a zero-delay timer
+%% asks for the first transport. The agent starts in the denied state.
+init([]) ->
+    erlang:process_flag(trap_exit, true),
+    erlang:start_timer(0, self(), create_transport),
+    {ok, ?STATE_DENIED, #state{}}.
+
+%% @private
+%% @doc Every event, whatever the state, is routed through handle_event/4.
+callback_mode() ->
+    handle_event_function.
+
+%% @private
+%% @doc Single entry point for casts, calls and plain messages.
+%%
+%% Data and events are sent straight to the transport while the agent is
+%% activated and are stored through cache_model in every other state.
+
+%% Metric data: send when activated, cache otherwise.
+handle_event(cast, {metric_data, ServiceId, DeviceUUID, LineProtocolData}, ?STATE_ACTIVATED, State = #state{transport_pid = TransportPid}) ->
+    Packet = message_pb:encode_msg(#data{
+        service_id = ServiceId,
+        device_uuid = DeviceUUID,
+        metric = LineProtocolData
+    }),
+    efka_transport:send(TransportPid, ?METHOD_DATA, Packet),
+    {keep_state, State};
+handle_event(cast, {metric_data, ServiceId, DeviceUUID, LineProtocolData}, _, State) ->
+    Packet = message_pb:encode_msg(#data{
+        service_id = ServiceId,
+        device_uuid = DeviceUUID,
+        metric = LineProtocolData
+    }),
+    ok = cache_model:insert(?METHOD_DATA, Packet),
+    {keep_state, State};
+
+%% Events: send when activated, cache otherwise.
+handle_event(cast, {event, ServiceId, EventType, Params}, ?STATE_ACTIVATED, State = #state{transport_pid = TransportPid}) ->
+    EventPacket = message_pb:encode_msg(#event{
+        service_id = ServiceId,
+        event_type = EventType,
+        params = Params
+    }),
+    efka_transport:send(TransportPid, ?METHOD_EVENT, EventPacket),
+    {keep_state, State};
+%% The state must be a wildcard here: a second ?STATE_ACTIVATED clause would be
+%% shadowed by the one above and events raised while offline would be lost.
+handle_event(cast, {event, ServiceId, EventType, Params}, _, State) ->
+    EventPacket = message_pb:encode_msg(#event{
+        service_id = ServiceId,
+        event_type = EventType,
+        params = Params
+    }),
+    ok = cache_model:insert(?METHOD_EVENT, EventPacket),
+    {keep_state, State};
+
+%% Heartbeat: only sent while activated. In any other state it is not cached;
+%% it falls through to the final catch-all clause and is dropped.
+handle_event(cast, {ping, AdCode, BootTime, Province, City, EfkaVersion, KernelArch, Ips, CpuCore, CpuLoad, CpuTemperature, Disk, Memory, Interfaces}, ?STATE_ACTIVATED,
+    State = #state{transport_pid = TransportPid}) ->
+
+    Ping = message_pb:encode_msg(#ping{
+        adcode = AdCode,
+        boot_time = BootTime,
+        province = Province,
+        city = City,
+        efka_version = EfkaVersion,
+        kernel_arch = KernelArch,
+        ips = Ips,
+        cpu_core = CpuCore,
+        cpu_load = CpuLoad,
+        cpu_temperature = CpuTemperature,
+        disk = Disk,
+        memory = Memory,
+        interfaces = Interfaces
+    }),
+    efka_transport:send(TransportPid, ?METHOD_PING, Ping),
+    {keep_state, State};
+
+%% (Re)create the transport. The timer is armed by init/1 and by the 'EXIT'
+%% clause, both of which leave the agent in the denied state, so that is the
+%% state this clause has to match.
+handle_event(info, {timeout, _, create_transport}, ?STATE_DENIED, State) ->
+    {ok, Props} = application:get_env(efka, tls_server),
+    Host = proplists:get_value(host, Props),
+    Port = proplists:get_value(port, Props),
+    {ok, TransportPid} = efka_transport:start_link(self(), Host, Port),
+    efka_transport:connect(TransportPid),
+
+    {next_state, ?STATE_CONNECTING, State#state{transport_pid = TransportPid}};
+
+%% Result of the connection attempt.
+handle_event(info, {connect_reply, Reply}, ?STATE_CONNECTING, State = #state{transport_pid = TransportPid}) ->
+    case Reply of
+        ok ->
+            AuthBin = auth_request(),
+            efka_transport:auth_request(TransportPid, AuthBin),
+            {next_state, ?STATE_AUTH, State};
+        {error, Reason} ->
+            lager:debug("[efka_agent] connect failed, error: ~p, pid: ~p", [Reason, TransportPid]),
+            efka_transport:stop(TransportPid),
+            %% transport_pid is kept on purpose: the 'EXIT' clause matches on
+            %% it and is the place where the reconnect timer is started.
+            {next_state, ?STATE_DENIED, State}
+    end;
+
+%% Result of the authentication request.
+handle_event(info, {auth_reply, Reply}, ?STATE_AUTH, State = #state{transport_pid = TransportPid}) ->
+    case Reply of
+        {ok, ReplyBin} ->
+            #auth_reply{code = Code, message = Message} = message_pb:decode_msg(ReplyBin, auth_reply),
+            case Code of
+                0 ->
+                    lager:debug("[efka_agent] auth success, message: ~p", [Message]),
+                    %% Flush everything that was cached while offline.
+                    CacheItems = cache_model:get_all_cache(),
+                    lists:foreach(fun(#cache{id = Id, method = Method, data = Packet}) ->
+                        efka_transport:send(TransportPid, Method, Packet),
+                        cache_model:delete(Id)
+                    end, CacheItems),
+                    {next_state, ?STATE_ACTIVATED, State};
+                1 ->
+                    %% The host is not authorised in the backend: the agent may
+                    %% not push data, but the server may still push commands,
+                    %% so the socket has to stay open.
+                    lager:debug("[efka_agent] auth denied, message: ~p", [Message]),
+                    {next_state, ?STATE_RESTRICTED, State};
+                2 ->
+                    %% Any other failure: retry after a delay. transport_pid is
+                    %% kept so that the 'EXIT' clause matches the stopped
+                    %% transport and schedules the retry.
+                    lager:debug("[efka_agent] auth failed, message: ~p", [Message]),
+                    efka_transport:stop(TransportPid),
+                    {next_state, ?STATE_DENIED, State};
+                _ ->
+                    %% Unknown code: handled like any other failure.
+                    lager:debug("[efka_agent] auth failed, invalid message"),
+                    efka_transport:stop(TransportPid),
+                    {next_state, ?STATE_DENIED, State}
+            end;
+        {error, Reason} ->
+            lager:debug("[efka_agent] auth_request failed, error: ~p", [Reason]),
+            efka_transport:stop(TransportPid),
+            {next_state, ?STATE_DENIED, State}
+    end;
+
+%% ------------------------------------------------------------------
+%% Messages pushed by the cloud server.
+%%
+%% NOTE(review): in this copy of the patch every binary payload pattern below
+%% reads `<>'. The original bit-syntax segments (which bind DeployBin,
+%% ServiceId, ConfigBin, InvokeBin, TaskLogBin and Auth) appear to have been
+%% lost and must be restored from version control; they are left untouched
+%% here rather than guessed.
+%% ------------------------------------------------------------------
+
+%% Deploy a micro service (the payload pattern must bind DeployBin).
+handle_event(info, {server_push, PacketId, <>}, ?STATE_ACTIVATED, State = #state{transport_pid = TransportPid}) ->
+    #deploy{task_id = TaskId, service_id = ServiceId, tar_url = TarUrl} = message_pb:decode_msg(DeployBin, deploy),
+
+    %% Per the original author's note this is a short wait: efka_inetd answers
+    %% as soon as it has received the request.
+    Reply = case efka_inetd:deploy(TaskId, ServiceId, TarUrl) of
+        ok ->
+            #async_call_reply{code = 1, result = <<"ok">>};
+        {error, Reason} when is_binary(Reason) ->
+            #async_call_reply{code = 0, message = Reason}
+    end,
+    efka_transport:async_call_reply(TransportPid, PacketId, message_pb:encode_msg(Reply)),
+
+    {keep_state, State};
+
+%% Start a micro service (the payload pattern must bind ServiceId).
+handle_event(info, {server_push, PacketId, <>}, ?STATE_ACTIVATED, State = #state{transport_pid = TransportPid}) ->
+    Reply = case efka_inetd:start_service(ServiceId) of
+        ok ->
+            #async_call_reply{code = 1, result = <<"ok">>};
+        {error, Reason} when is_binary(Reason) ->
+            #async_call_reply{code = 0, message = Reason}
+    end,
+    efka_transport:async_call_reply(TransportPid, PacketId, message_pb:encode_msg(Reply)),
+
+    {keep_state, State};
+
+%% Stop a micro service (the payload pattern must bind ServiceId).
+handle_event(info, {server_push, PacketId, <>}, ?STATE_ACTIVATED, State = #state{transport_pid = TransportPid}) ->
+    Reply = case efka_inetd:stop_service(ServiceId) of
+        ok ->
+            #async_call_reply{code = 1, result = <<"ok">>};
+        {error, Reason} when is_binary(Reason) ->
+            #async_call_reply{code = 0, message = Reason}
+    end,
+    efka_transport:async_call_reply(TransportPid, PacketId, message_pb:encode_msg(Reply)),
+
+    {keep_state, State};
+
+%% Push a config.json to a micro service (the payload pattern must bind
+%% ConfigBin). The answer is sent later, from the service_reply or the
+%% request_timeout clause, whichever comes first.
+handle_event(info, {server_push, PacketId, <>}, ?STATE_ACTIVATED, State = #state{transport_pid = TransportPid, inflight = Inflight}) ->
+    #push_service_config{service_id = ServiceId, config_json = ConfigJson, timeout = Timeout} = message_pb:decode_msg(ConfigBin, push_service_config),
+
+    case efka_service:get_pid(ServiceId) of
+        undefined ->
+            Reply = #async_call_reply{code = 0, message = <<"service not run">>},
+            efka_transport:async_call_reply(TransportPid, PacketId, message_pb:encode_msg(Reply)),
+            {keep_state, State};
+        ServicePid when is_pid(ServicePid) ->
+            Ref = make_ref(),
+            %% Forward the configuration to the micro service.
+            efka_service:push_config(ServicePid, Ref, ConfigJson),
+            %% Bound the time the server waits for an answer.
+            erlang:start_timer(Timeout, self(), {request_timeout, Ref}),
+
+            {keep_state, State#state{inflight = maps:put(Ref, PacketId, Inflight)}}
+    end;
+
+%% Invoke a micro service and relay its answer (the payload pattern must bind
+%% InvokeBin).
+handle_event(info, {server_push, PacketId, <>}, ?STATE_ACTIVATED, State = #state{inflight = Inflight, transport_pid = TransportPid}) ->
+    #invoke{service_id = ServiceId, payload = Payload, timeout = Timeout} = message_pb:decode_msg(InvokeBin, invoke),
+    case efka_service:get_pid(ServiceId) of
+        undefined ->
+            Reply = #async_call_reply{code = 0, message = <<"micro_service not run">>},
+            efka_transport:async_call_reply(TransportPid, PacketId, message_pb:encode_msg(Reply)),
+
+            {keep_state, State};
+        ServicePid when is_pid(ServicePid) ->
+            Ref = make_ref(),
+            efka_service:invoke(ServicePid, Ref, Payload),
+            %% Bound the time the server waits for an answer.
+            erlang:start_timer(Timeout, self(), {request_timeout, Ref}),
+
+            {keep_state, State#state{inflight = maps:put(Ref, PacketId, Inflight)}}
+    end;
+
+%% Fetch the log of a task (the payload pattern must bind TaskLogBin).
+handle_event(info, {server_push, PacketId, <>}, ?STATE_ACTIVATED, State = #state{transport_pid = TransportPid}) ->
+    #fetch_task_log{task_id = TaskId} = message_pb:decode_msg(TaskLogBin, fetch_task_log),
+    lager:debug("[efka_agent] get task_log request: ~p", [TaskId]),
+    {ok, Logs} = efka_inetd_task_log:get_logs(TaskId),
+    %% Matching on the list shape avoids walking the whole list with length/1.
+    Reply = case Logs of
+        [_ | _] ->
+            Result = iolist_to_binary(jiffy:encode(Logs, [force_utf8])),
+            #async_call_reply{code = 1, result = Result};
+        [] ->
+            #async_call_reply{code = 1, result = <<"[]">>}
+    end,
+    efka_transport:async_call_reply(TransportPid, PacketId, message_pb:encode_msg(Reply)),
+
+    {keep_state, State};
+
+%% Authorisation command from the server (the payload pattern must bind Auth).
+handle_event(info, {server_command, ?COMMAND_AUTH, <>}, StateName, State = #state{transport_pid = TransportPid}) ->
+    case {Auth, StateName} of
+        {1, ?STATE_ACTIVATED} ->
+            {keep_state, State};
+        {1, Waiting} when Waiting =:= ?STATE_RESTRICTED; Waiting =:= ?STATE_DENIED ->
+            %% Re-activation requires a fresh authentication. The restricted
+            %% state is the one that keeps the connection open after a denial,
+            %% so the command has to be honoured there as well.
+            AuthRequestBin = auth_request(),
+            efka_transport:auth_request(TransportPid, AuthRequestBin),
+            {next_state, ?STATE_AUTH, State};
+        {0, _} ->
+            %% The host is now restricted: it may not send data but still
+            %% accepts what the server pushes.
+            {next_state, ?STATE_RESTRICTED, State};
+        _ ->
+            %% E.g. an activation that arrives while an authentication is
+            %% already running: nothing to do, and not worth a crash.
+            lager:warning("[efka_agent] ignore auth command: ~p in state: ~p", [Auth, StateName]),
+            {keep_state, State}
+    end;
+
+%% Publication from the server: hand it to the subscription system.
+handle_event(info, {server_pub, Topic, Content}, ?STATE_ACTIVATED, State) ->
+    lager:debug("[efka_agent] get pub topic: ~p, content: ~p", [Topic, Content]),
+    efka_subscription:publish(Topic, Content),
+    {keep_state, State};
+
+%% Answer from an efka_service process for a pending request.
+handle_event(info, {service_reply, Ref, EmsReply}, ?STATE_ACTIVATED, State = #state{inflight = Inflight, transport_pid = TransportPid}) ->
+    case maps:take(Ref, Inflight) of
+        error ->
+            %% Already answered (timed out) or unknown reference.
+            {keep_state, State};
+        {PacketId, NInflight} ->
+            Reply = case EmsReply of
+                {ok, Result} ->
+                    #async_call_reply{code = 1, result = Result};
+                {error, Reason} ->
+                    #async_call_reply{code = 0, message = Reason}
+            end,
+            efka_transport:async_call_reply(TransportPid, PacketId, message_pb:encode_msg(Reply)),
+
+            {keep_state, State#state{inflight = NInflight}}
+    end;
+
+%% A pending request was not answered in time.
+handle_event(info, {timeout, _, {request_timeout, Ref}}, ?STATE_ACTIVATED, State = #state{inflight = Inflight, transport_pid = TransportPid}) ->
+    case maps:take(Ref, Inflight) of
+        error ->
+            %% The service answered before the timer fired.
+            {keep_state, State};
+        {PacketId, NInflight} ->
+            Reply = #async_call_reply{code = 0, message = <<"reqeust timeout">>, result = <<>>},
+            efka_transport:async_call_reply(TransportPid, PacketId, message_pb:encode_msg(Reply)),
+
+            {keep_state, State#state{inflight = NInflight}}
+    end;
+
+%% The transport process died, in whatever state: reconnect in 5 seconds.
+%% Pending requests belong to the dead connection and can no longer be
+%% answered, so they are dropped instead of piling up in the map.
+handle_event(info, {'EXIT', TransportPid, Reason}, _, State = #state{transport_pid = TransportPid}) ->
+    lager:debug("[efka_agent] transport pid: ~p, exit with reason: ~p", [TransportPid, Reason]),
+    erlang:start_timer(5000, self(), create_transport),
+    {next_state, ?STATE_DENIED, State#state{transport_pid = undefined, inflight = #{}}};
+
+%% A call nothing above understands is answered with an error so that the
+%% caller neither blocks until its timeout nor takes the agent down.
+handle_event({call, From}, Request, StateName, _State) ->
+    lager:warning("[efka_agent] unsupported call: ~p in state: ~p", [Request, StateName]),
+    {keep_state_and_data, [{reply, From, {error, unsupported_request}}]};
+
+%% Anything else is out of place in the current state (a heartbeat while
+%% offline, a late service reply or timer, a server push while restricted).
+%% Such events are expected around state changes; they are logged and dropped.
+handle_event(EventType, Event, StateName, _State) ->
+    lager:warning("[efka_agent] ignore ~p event: ~p in state: ~p", [EventType, Event, StateName]),
+    keep_state_and_data.
+
+%% @private
+%% @doc Called when the gen_statem is about to terminate. Nothing is cleaned
+%% up here; the return value is ignored.
+terminate(_Reason, _StateName, _State = #state{}) ->
+    ok.
+
+%% @private
+%% @doc Code upgrade hook: state name and data are carried over unchanged.
+code_change(_OldVsn, StateName, State = #state{}, _Extra) ->
+    {ok, StateName, State}.
+
+%%%===================================================================
+%%% Internal functions
+%%%===================================================================
+
+-spec auth_request() -> binary().
+%% Builds the encoded authentication request from the `auth' environment
+%% entry of the efka application. The uuid, username, salt and token entries
+%% are converted to binaries and the current timestamp is attached.
+auth_request() ->
+    {ok, AuthInfo} = application:get_env(efka, auth),
+    %% Reads one credential from the property list and converts it.
+    Credential = fun(Key) ->
+        unicode:characters_to_binary(proplists:get_value(Key, AuthInfo))
+    end,
+    Request = #auth_request{
+        uuid = Credential(uuid),
+        username = Credential(username),
+        salt = Credential(salt),
+        token = Credential(token),
+        timestamp = efka_util:timestamp()
+    },
+    message_pb:encode_msg(Request).
\ No newline at end of file