Skip to content
This repository was archived by the owner on Jul 31, 2023. It is now read-only.

Commit 7468e8f

Browse files
authored
initial work on sweeper process for cleaning up orphaned spans (#118)
* initial work on sweeper process for cleaning up orphaned spans * add test suite for sweeper process * add docs about sweeper to README * add sweeper config to example project * use dialyzer nowarn_function instead of adding _ to record spec * update configto a map and add mention of infinity interval
1 parent 315ce74 commit 7468e8f

File tree

6 files changed

+274
-3
lines changed

6 files changed

+274
-3
lines changed

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,27 @@ Headers = [{oc_span_ctx_header:field_name(), EncodedSpanCtx}],
8282

8383
[DataDog][oc_datadog]: Export spans to DataDog APM
8484

85+
#### <a name="Sweeper">Cleaning Up Abandoned Spans</a> ####
86+
87+
Active spans have their data stored in an ETS table. When a span is finished it is removed from the active spans table and moved to a table handled by the reporter process. If a span isn't finished, either because of a mistake in the code creating and finishing spans, or the process with open spans crashes before being able to finish the spans, there would be a memory leak.
88+
89+
The `oc_span_sweeper` process checks for active spans which started greater than a configurable (`span_ttl`) duration, with a default of 5 minutes. There are 4 strategies for handling a span that is older than the time to live (`strategy`):
90+
91+
* `drop`: Spans are removed from the active span table and a log message is written with the total number of spans being dropped in this sweep.
92+
* `finish`: Each span is finished as is.
93+
* `failed_attribute_and_finish`: An attribute `finished_by_sweeper` with value `true` is added to the span data and then the span is finished.
94+
* Custom function: Any funtion with type spec `fun((opencensus:span()) -> ok)` can be used. Note that the span is not removed from the active spans table if this method is used and the function must do the removal if it deems it necessary.
95+
96+
An example configuration in `sys.config` to run a check every 5 minutes, dropping active spans older than 5 minutes can be found in the example project `helloworld`, `examples/helloworld/config/sys.config`, the sweeper snippet looks like:
97+
98+
``` erlang
99+
{sweeper, #{interval => 300000,
100+
strategy => drop,
101+
span_ttl => 300000}}
102+
```
103+
104+
To disable sweeping set `interval` to `infinity`.
105+
85106
### <a name="Logging">Logging</a> ###
86107

87108
OTP-21 includes a new logging framework. When a context is created with a span (for example `ocp:with_child_span/1` or `oc_trace:with_child_span/2`) opencensus will update the current process's logger metadata to include the `trace_id`, `span_id` and `trace_options` with the latest ids under the key `span_ctx`, `trace_options` will be `1` if the trace is enabled. To use these with the default formatter you can create a custom template that includes them if they exist like so:

examples/helloworld/config/sys.config

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
{opencensus, [{sampler, {oc_sampler_always, []}},
33
{reporter, {oc_reporter_stdout, []}},
44

5-
{stat, [{exporters, [{oc_stat_exporter_stdout, []}]}]}]},
5+
{stat, [{exporters, [{oc_stat_exporter_stdout, []}]}]},
6+
7+
{sweeper, #{interval => 300000,
8+
strategy => drop,
9+
span_ttl => 300000}}]},
610

711
{kernel,
812
[{logger,

src/oc_span_sweeper.erl

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
%%%------------------------------------------------------------------------
2+
%% Copyright 2018, OpenCensus Authors
3+
%% Licensed under the Apache License, Version 2.0 (the "License");
4+
%% you may not use this file except in compliance with the License.
5+
%% You may obtain a copy of the License at
6+
%%
7+
%% http://www.apache.org/licenses/LICENSE-2.0
8+
%%
9+
%% Unless required by applicable law or agreed to in writing, software
10+
%% distributed under the License is distributed on an "AS IS" BASIS,
11+
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
%% See the License for the specific language governing permissions and
13+
%% limitations under the License.
14+
%%%------------------------------------------------------------------------
15+
-module(oc_span_sweeper).
16+
17+
-behaviour(gen_statem).
18+
19+
-export([start_link/0]).
20+
21+
-export([init/1,
22+
callback_mode/0,
23+
handle_event/4,
24+
code_change/4,
25+
terminate/3]).
26+
27+
-include("opencensus.hrl").
28+
-include("oc_logger.hrl").
29+
30+
-record(data, {interval :: integer() | infinity,
31+
strategy :: drop | finish | failed_attribute_and_finish | fun((opencensus:span()) -> ok),
32+
ttl :: integer()}).
33+
34+
start_link() ->
35+
gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []).
36+
37+
init([]) ->
38+
SweeperConfig = application:get_env(opencensus, sweeper, #{}),
39+
40+
Interval = maps:get(interval, SweeperConfig, timer:minutes(5)),
41+
Strategy = maps:get(strategy, SweeperConfig, drop),
42+
TTL = maps:get(span_ttl, SweeperConfig, timer:minutes(5)),
43+
{ok, ready, #data{interval=Interval,
44+
strategy=Strategy,
45+
ttl=erlang:convert_time_unit(TTL, millisecond, native)},
46+
[hibernate, {state_timeout, Interval, sweep}]}.
47+
48+
callback_mode() ->
49+
handle_event_function.
50+
51+
handle_event(state_timeout, sweep, _, #data{interval=Interval,
52+
strategy=drop,
53+
ttl=TTL}) ->
54+
TooOld = erlang:monotonic_time() - TTL,
55+
case ets:select_delete(?SPAN_TAB, expired_match_spec(TooOld, true)) of
56+
0 ->
57+
ok;
58+
NumDeleted ->
59+
?LOG_INFO("sweep old spans: ttl=~p num_dropped=~p", [TTL, NumDeleted])
60+
end,
61+
{keep_state_and_data, [hibernate, {state_timeout, Interval, sweep}]};
62+
handle_event(state_timeout, sweep, _, #data{interval=Interval,
63+
strategy=finish,
64+
ttl=TTL}) ->
65+
Expired = select_expired(TTL),
66+
[finish_span(Span) || Span <- Expired],
67+
{keep_state_and_data, [hibernate, {state_timeout, Interval, sweep}]};
68+
handle_event(state_timeout, sweep, _, #data{interval=Interval,
69+
strategy=failed_attribute_and_finish,
70+
ttl=TTL}) ->
71+
Expired = select_expired(TTL),
72+
[finish_span(oc_span:put_attribute(<<"finished_by_sweeper">>, true, Span)) || Span <- Expired],
73+
{keep_state_and_data, [hibernate, {state_timeout, Interval, sweep}]};
74+
handle_event(state_timeout, sweep, _, #data{interval=Interval,
75+
strategy=Fun,
76+
ttl=TTL}) when is_function(Fun) ->
77+
Expired = select_expired(TTL),
78+
[Fun(Span) || Span <- Expired],
79+
{keep_state_and_data, [hibernate, {state_timeout, Interval, sweep}]};
80+
handle_event(_, _, _, _Data) ->
81+
keep_state_and_data.
82+
83+
code_change(_, State, Data, _) ->
84+
{ok, State, Data}.
85+
86+
terminate(_Reason, _State, _Data) ->
87+
ok.
88+
89+
%%
90+
91+
%% ignore these functions because dialyzer doesn't like match spec use of '_'
92+
-dialyzer({nowarn_function, expired_match_spec/2}).
93+
-dialyzer({nowarn_function, finish_span/1}).
94+
-dialyzer({nowarn_function, select_expired/1}).
95+
96+
expired_match_spec(Time, Return) ->
97+
[{#span{start_time={'$1', '_'}, _='_'},
98+
[{'<', '$1', Time}],
99+
[Return]}].
100+
101+
finish_span(S=#span{span_id=SpanId}) ->
102+
oc_span:finish_span(S),
103+
ets:delete(?SPAN_TAB, SpanId).
104+
105+
select_expired(TTL) ->
106+
TooOld = erlang:monotonic_time() - TTL,
107+
ets:select(?SPAN_TAB, expired_match_spec(TooOld, '$_')).

src/opencensus_sup.erl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@ init([]) ->
6060
type => worker,
6161
modules => [oc_server]},
6262

63+
Sweeper = #{id => oc_span_sweeper,
64+
start => {oc_span_sweeper, start_link, []},
65+
restart => permanent,
66+
shutdown => 1000,
67+
type => worker,
68+
modules => [oc_span_sweeper]},
69+
6370
{ok, {#{strategy => one_for_one,
6471
intensity => 1,
65-
period => 5}, [Reporter, Exporter, ViewServer, TraceServer]}}.
72+
period => 5}, [Reporter, Exporter, ViewServer, TraceServer, Sweeper]}}.

test/oc_std_encoder_SUITE.erl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ init_per_suite(Config) ->
2121
Config.
2222

2323
end_per_suite(_Config) ->
24-
ok = application:stop(opencensus).
24+
ok = application:stop(opencensus),
25+
application:unload(opencensus).
2526

2627
init_per_testcase(_, Config) ->
2728
Config.

test/oc_sweeper_SUITE.erl

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
2+
%%% ---------------------------------------------------------------------------
3+
%%% @doc
4+
%%% @end
5+
%%% ---------------------------------------------------------------------------
6+
-module(oc_sweeper_SUITE).
7+
8+
-compile(export_all).
9+
10+
-include_lib("eunit/include/eunit.hrl").
11+
-include_lib("common_test/include/ct.hrl").
12+
13+
-include("oc_test_utils.hrl").
14+
-include("opencensus.hrl").
15+
16+
all() ->
17+
[drop,
18+
finish,
19+
failed_attribute_and_finish].
20+
21+
init_per_suite(Config) ->
22+
ok = application:load(opencensus),
23+
Config.
24+
25+
end_per_suite(_Config) ->
26+
ok.
27+
28+
init_per_testcase(Type, Config) ->
29+
application:set_env(opencensus, sweeper, #{interval => 250,
30+
strategy => Type,
31+
span_ttl => 500}),
32+
33+
application:set_env(opencensus, send_interval_ms, 1),
34+
application:set_env(opencensus, reporter, {oc_reporter_pid, []}),
35+
application:set_env(opencensus, pid_reporter, #{pid => self()}),
36+
{ok, _} = application:ensure_all_started(opencensus),
37+
Config.
38+
39+
end_per_testcase(_, _Config) ->
40+
ok = application:stop(opencensus),
41+
ok.
42+
43+
drop(_Config) ->
44+
SpanName1 = <<"span-1">>,
45+
SpanCtx = oc_trace:start_span(SpanName1, undefined),
46+
47+
ChildSpanName1 = <<"child-span-1">>,
48+
ChildSpanCtx = oc_trace:start_span(ChildSpanName1, SpanCtx),
49+
50+
[ChildSpanData] = ets:lookup(?SPAN_TAB, ChildSpanCtx#span_ctx.span_id),
51+
?assertEqual(ChildSpanName1, ChildSpanData#span.name),
52+
?assertEqual(SpanCtx#span_ctx.span_id, ChildSpanData#span.parent_span_id),
53+
54+
oc_trace:finish_span(ChildSpanCtx),
55+
56+
%% wait until the sweeper sweeps away the parent span
57+
?UNTIL(ets:tab2list(?SPAN_TAB) =:= []),
58+
59+
oc_trace:finish_span(SpanCtx),
60+
61+
receive
62+
{span, S=#span{name=Name}} when Name =:= ChildSpanName1 ->
63+
%% Verify the end time and duration are set when the span was finished
64+
?assertMatch({ST, O} when is_integer(ST)
65+
andalso is_integer(O), S#span.start_time),
66+
?assertMatch({ST, O} when is_integer(ST)
67+
andalso is_integer(O), S#span.end_time)
68+
end,
69+
70+
%% sleep long enough that the reporter would have run again for sure
71+
timer:sleep(10),
72+
73+
%% should be no reported span for span-1
74+
?assertEqual(no_span, receive
75+
{span, #span{name=N}} when N =:= SpanName1 ->
76+
got_span
77+
after
78+
0 ->
79+
no_span
80+
end).
81+
82+
finish(_Config) ->
83+
SpanName1 = <<"span-1">>,
84+
SpanCtx = oc_trace:start_span(SpanName1, undefined),
85+
86+
ChildSpanName1 = <<"child-span-1">>,
87+
ChildSpanCtx = oc_trace:start_span(ChildSpanName1, SpanCtx),
88+
oc_trace:finish_span(ChildSpanCtx),
89+
90+
%% wait until the sweeper sweeps away the parent span
91+
?UNTIL(ets:tab2list(?SPAN_TAB) =:= []),
92+
93+
lists:foreach(fun(Name) ->
94+
receive
95+
{span, S=#span{name=Name}} ->
96+
%% Verify the end time and duration are set when the span was finished
97+
?assertMatch({ST, O} when is_integer(ST)
98+
andalso is_integer(O), S#span.start_time),
99+
?assertMatch({ST, O} when is_integer(ST)
100+
andalso is_integer(O), S#span.end_time)
101+
end
102+
end, [SpanName1, ChildSpanName1]).
103+
104+
failed_attribute_and_finish(_Config) ->
105+
SpanName1 = <<"span-1">>,
106+
SpanCtx = oc_trace:start_span(SpanName1, undefined),
107+
108+
ChildSpanName1 = <<"child-span-1">>,
109+
ChildSpanCtx = oc_trace:start_span(ChildSpanName1, SpanCtx),
110+
111+
[ChildSpanData] = ets:lookup(?SPAN_TAB, ChildSpanCtx#span_ctx.span_id),
112+
?assertEqual(ChildSpanName1, ChildSpanData#span.name),
113+
?assertEqual(SpanCtx#span_ctx.span_id, ChildSpanData#span.parent_span_id),
114+
115+
oc_trace:finish_span(ChildSpanCtx),
116+
117+
%% wait until the sweeper sweeps away the parent span
118+
?UNTIL(ets:tab2list(?SPAN_TAB) =:= []),
119+
120+
receive
121+
{span, S=#span{name=Name,
122+
attributes=Attributes}} when Name =:= SpanName1 ->
123+
%% should have attribute finished_by_sweeper
124+
?assertMatch(#{<<"finished_by_sweeper">> := true}, Attributes),
125+
126+
%% Verify the end time and duration are set when the span was finished
127+
?assertMatch({ST, O} when is_integer(ST)
128+
andalso is_integer(O), S#span.start_time),
129+
?assertMatch({ST, O} when is_integer(ST)
130+
andalso is_integer(O), S#span.end_time)
131+
end.

0 commit comments

Comments
 (0)