/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Ericsson AB
 */

#ifndef _DSW_EVDEV_H_
#define _DSW_EVDEV_H_

#include <eventdev_pmd.h>

#include <rte_event_ring.h>
#include <rte_eventdev.h>

#define DSW_PMD_NAME RTE_STR(event_dsw)

#define DSW_MAX_PORTS (64)
#define DSW_MAX_PORT_DEQUEUE_DEPTH (128)
#define DSW_MAX_PORT_ENQUEUE_DEPTH (128)
#define DSW_MAX_PORT_OUT_BUFFER (32)

#define DSW_MAX_QUEUES (16)

#define DSW_MAX_EVENTS (16384)

/* Multiple 24-bit flow ids will map to the same DSW-level flow. The
 * number of DSW flows should be high enough to make it unlikely that
 * the flow ids of several large flows hash to the same DSW-level
 * flow. Such collisions will limit parallelism and thus the number of
 * cores that may be utilized. However, configuring a large number of
 * DSW flows might potentially, depending on traffic and the actual
 * application flow id value range, result in each such DSW-level flow
 * being very small. The effect of migrating such flows will be small,
 * in terms of the amount of processing load redistributed. This will
 * in turn reduce the load balancing speed, since the flow migration
 * rate has an upper limit. Code changes are required to allow > 32k
 * DSW-level flows.
 */
#define DSW_MAX_FLOWS_BITS (13)
#define DSW_MAX_FLOWS (1<<(DSW_MAX_FLOWS_BITS))
#define DSW_MAX_FLOWS_MASK (DSW_MAX_FLOWS-1)
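
/* For illustration only (an assumption about the mapping implied by
 * the mask above, not necessarily the driver's actual hash function):
 * a 24-bit eventdev flow id could be reduced to a DSW-level flow hash
 * by masking off all but the low DSW_MAX_FLOWS_BITS bits, e.g.
 *
 *   uint16_t flow_hash = ev->flow_id & DSW_MAX_FLOWS_MASK;
 *
 * which yields at most DSW_MAX_FLOWS (8192) distinct DSW-level flows.
 */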

/* Eventdev RTE_SCHED_TYPE_PARALLEL doesn't have a concept of flows,
 * but the 'dsw' scheduler (more or less) randomly assigns flow ids to
 * events on parallel queues, to be able to reuse some of the
 * migration mechanism and scheduling logic from
 * RTE_SCHED_TYPE_ATOMIC. By moving one of the parallel "flows" away
 * from a particular port, the likelihood of events being scheduled to
 * that port is reduced, and thus a kind of statistical load balancing
 * is achieved.
 */
#define DSW_PARALLEL_FLOWS (1024)

/* 'Background tasks' poll the control rings for migration-related
 * messages and flush the output buffers (so buffered events don't
 * linger too long). The number of port operations between background
 * tasks shouldn't be too low, since the system then won't benefit
 * from the 'batching' effects of the output buffer, and it shouldn't
 * be too high, since that will make buffered events linger too long
 * in case the port goes idle.
 */
#define DSW_MAX_PORT_OPS_PER_BG_TASK (128)

/* Avoid making small 'loans' from the central in-flight event credit
 * pool, to improve efficiency.
 */
#define DSW_MIN_CREDIT_LOAN (64)
#define DSW_PORT_MAX_CREDITS (2*DSW_MIN_CREDIT_LOAN)
#define DSW_PORT_MIN_CREDITS (DSW_MIN_CREDIT_LOAN)
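
/* A plausible reading of the three constants above (an assumption;
 * the exact policy lives in the enqueue/dequeue paths, not in this
 * header): a port borrows from the shared credits_on_loan counter in
 * chunks of at least DSW_MIN_CREDIT_LOAN, and hands credits back once
 * its local balance exceeds DSW_PORT_MAX_CREDITS, keeping per-port
 * balances roughly within [DSW_PORT_MIN_CREDITS, DSW_PORT_MAX_CREDITS].
 */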

/* The rings are dimensioned so that all in-flight events can reside
 * on any one of the port rings, to avoid the trouble of having to
 * care about the case where there's no room on the destination port's
 * input ring.
 */
#define DSW_IN_RING_SIZE (DSW_MAX_EVENTS)

#define DSW_MAX_LOAD (INT16_MAX)
#define DSW_LOAD_FROM_PERCENT(x) ((int16_t)(((x)*DSW_MAX_LOAD)/100))
#define DSW_LOAD_TO_PERCENT(x) ((100*(x))/DSW_MAX_LOAD)
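
/* Worked example of the conversion macros above:
 * DSW_LOAD_FROM_PERCENT(70) = (70*32767)/100 = 22936, and
 * DSW_LOAD_TO_PERCENT(22936) = (100*22936)/32767 = 69 (integer
 * division truncates).
 */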

/* The thought behind keeping the load update interval shorter than
 * the migration interval is that the load from newly migrated flows
 * should 'show up' on the load measurement before new migrations are
 * considered. This is to avoid having too many flows, from too many
 * source ports, migrated too quickly to a lightly loaded port - in
 * particular since this might cause the system to oscillate.
 */
#define DSW_LOAD_UPDATE_INTERVAL (DSW_MIGRATION_INTERVAL/4)
#define DSW_OLD_LOAD_WEIGHT (1)

/* The minimum time (in us) between two flow migrations. What puts an
 * upper limit on the actual migration rate is primarily the pace at
 * which the ports send and receive control messages, which in turn is
 * largely a function of how many cycles are spent processing an event
 * burst.
 */
#define DSW_MIGRATION_INTERVAL (1000)
#define DSW_MIN_SOURCE_LOAD_FOR_MIGRATION (DSW_LOAD_FROM_PERCENT(70))
#define DSW_MAX_TARGET_LOAD_FOR_MIGRATION (DSW_LOAD_FROM_PERCENT(95))
#define DSW_REBALANCE_THRESHOLD (DSW_LOAD_FROM_PERCENT(3))

#define DSW_MAX_EVENTS_RECORDED (128)

#define DSW_MAX_FLOWS_PER_MIGRATION (8)

/* Only one outstanding migration per port is allowed */
#define DSW_MAX_PAUSED_FLOWS (DSW_MAX_PORTS*DSW_MAX_FLOWS_PER_MIGRATION)

/* Enough room for pause request/confirm and unpause request/confirm for
 * all possible senders.
 */
#define DSW_CTL_IN_RING_SIZE ((DSW_MAX_PORTS-1)*4)
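
/* Worked example of the sizing above: with DSW_MAX_PORTS = 64, each
 * of the up to 63 other ports may have one pause request, pause
 * confirmation, unpause request and unpause confirmation outstanding
 * toward a given port, i.e. at most (64-1)*4 = 252 control messages
 * in that port's ctl_in_ring.
 */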

/* With DSW_SORT_DEQUEUED enabled, the scheduler will, at the point of
 * dequeue(), arrange events so that events with the same flow id on
 * the same queue form a back-to-back "burst", and also so that such
 * bursts of different flow ids, but on the same queue, come
 * consecutively. All this is done in an attempt to improve data and
 * instruction cache usage for the application, at the cost of a
 * scheduler overhead increase.
 */

/* #define DSW_SORT_DEQUEUED */
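
/* To try this out (an assumption about the build setup, not a
 * documented option): uncomment the define above, or pass
 * -DDSW_SORT_DEQUEUED in the compiler flags when building the PMD.
 */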

struct dsw_queue_flow {
	uint8_t queue_id;
	uint16_t flow_hash;
};

enum dsw_migration_state {
	DSW_MIGRATION_STATE_IDLE,
	DSW_MIGRATION_STATE_FINISH_PENDING,
	DSW_MIGRATION_STATE_PAUSING,
	DSW_MIGRATION_STATE_UNPAUSING
};

struct __rte_cache_aligned dsw_port {
	uint16_t id;

	/* Keeping a pointer here to avoid container_of() calls. Such
	 * calls would be very frequent and each would involve an
	 * integer multiplication (since the port id is an index into
	 * the dsw_evdev port array), making them comparatively
	 * expensive.
	 */
	struct dsw_evdev *dsw;

	uint16_t dequeue_depth;
	uint16_t enqueue_depth;

	int32_t inflight_credits;

	int32_t new_event_threshold;

	bool implicit_release;

	uint16_t pending_releases;

	uint16_t next_parallel_flow_id;

	uint16_t ops_since_bg_task;

	/* Most recent 'background' processing. */
	uint64_t last_bg;

	/* For port load measurement. */
	uint64_t next_load_update;
	uint64_t load_update_interval;
	uint64_t measurement_start;
	uint64_t busy_start;
	uint64_t busy_cycles;
	uint64_t total_busy_cycles;

	/* For the ctl interface and flow migration mechanism. */
	uint64_t next_emigration;
	uint64_t migration_interval;
	enum dsw_migration_state migration_state;

	uint64_t emigration_start;
	uint64_t emigrations;
	uint64_t emigration_latency;

	uint8_t emigration_target_port_ids[DSW_MAX_FLOWS_PER_MIGRATION];
	struct dsw_queue_flow
		emigration_target_qfs[DSW_MAX_FLOWS_PER_MIGRATION];
	uint8_t emigration_targets_len;
	uint8_t cfm_cnt;

	uint64_t immigrations;

	uint16_t paused_flows_len;
	struct dsw_queue_flow paused_flows[DSW_MAX_PAUSED_FLOWS];

	/* In a very contrived worst case, all inflight events can be
	 * lying around paused here.
	 */
	uint16_t paused_events_len;
	struct rte_event paused_events[DSW_MAX_EVENTS];

	uint16_t emigrating_events_len;
	/* Buffer for not-yet-processed events pertaining to a flow
	 * emigrating from this port. These events will be forwarded
	 * to the target port.
	 */
	struct rte_event emigrating_events[DSW_MAX_EVENTS];

	uint16_t seen_events_len;
	uint16_t seen_events_idx;
	struct dsw_queue_flow seen_events[DSW_MAX_EVENTS_RECORDED];

	uint64_t enqueue_calls;
	uint64_t new_enqueued;
	uint64_t forward_enqueued;
	uint64_t release_enqueued;
	uint64_t queue_enqueued[DSW_MAX_QUEUES];

	uint64_t dequeue_calls;
	uint64_t dequeued;
	uint64_t queue_dequeued[DSW_MAX_QUEUES];

	uint16_t out_buffer_len[DSW_MAX_PORTS];
	struct rte_event out_buffer[DSW_MAX_PORTS][DSW_MAX_PORT_OUT_BUFFER];

	uint16_t in_buffer_len;
	uint16_t in_buffer_start;
	/* This buffer may contain events that were read from the
	 * in_ring during the flow migration process.
	 */
	struct rte_event in_buffer[DSW_MAX_EVENTS];

	alignas(RTE_CACHE_LINE_SIZE) struct rte_event_ring *in_ring;

	alignas(RTE_CACHE_LINE_SIZE) struct rte_ring *ctl_in_ring;

	/* Estimate of current port load. */
	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int16_t) load;
	/* Estimate of flows currently migrating to this port. */
	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int32_t) immigration_load;
};

struct dsw_queue {
	uint8_t schedule_type;
	uint64_t serving_ports;
	uint16_t num_serving_ports;

	alignas(RTE_CACHE_LINE_SIZE) uint8_t flow_to_port_map[DSW_MAX_FLOWS];
};

/* Limited by the size of the 'serving_ports' bitmask */
static_assert(DSW_MAX_PORTS <= 64, "Max compile-time port count exceeded");
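
/* Illustration (an assumption based on the bitmask comment above):
 * bit i of 'serving_ports' being set means that port i serves the
 * queue, e.g. serving_ports == 0x5 would mean ports 0 and 2 serve it,
 * with num_serving_ports == 2.
 */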

struct dsw_evdev {
	struct rte_eventdev_data *data;

	struct dsw_port ports[DSW_MAX_PORTS];
	uint16_t num_ports;
	struct dsw_queue queues[DSW_MAX_QUEUES];
	uint8_t num_queues;
	int32_t max_inflight;

	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int32_t) credits_on_loan;
};

#define DSW_CTL_PAUSE_REQ (0)
#define DSW_CTL_UNPAUSE_REQ (1)
#define DSW_CTL_CFM (2)

struct __rte_aligned(4) dsw_ctl_msg {
	uint8_t type;
	uint8_t originating_port_id;
	uint8_t qfs_len;
	struct dsw_queue_flow qfs[DSW_MAX_FLOWS_PER_MIGRATION];
};

uint16_t dsw_event_enqueue(void *port, const struct rte_event *event);
uint16_t dsw_event_enqueue_burst(void *port,
				 const struct rte_event events[],
				 uint16_t events_len);
uint16_t dsw_event_enqueue_new_burst(void *port,
				     const struct rte_event events[],
				     uint16_t events_len);
uint16_t dsw_event_enqueue_forward_burst(void *port,
					 const struct rte_event events[],
					 uint16_t events_len);

uint16_t dsw_event_dequeue(void *port, struct rte_event *ev, uint64_t wait);
uint16_t dsw_event_dequeue_burst(void *port, struct rte_event *events,
				 uint16_t num, uint64_t wait);
void dsw_event_maintain(void *port, int op);

int dsw_xstats_get_names(const struct rte_eventdev *dev,
			 enum rte_event_dev_xstats_mode mode,
			 uint8_t queue_port_id,
			 struct rte_event_dev_xstats_name *xstats_names,
			 uint64_t *ids, unsigned int size);
int dsw_xstats_get(const struct rte_eventdev *dev,
		   enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id,
		   const uint64_t ids[], uint64_t values[], unsigned int n);
uint64_t dsw_xstats_get_by_name(const struct rte_eventdev *dev,
				const char *name, uint64_t *id);

static inline struct dsw_evdev *
dsw_pmd_priv(const struct rte_eventdev *eventdev)
{
	return eventdev->data->dev_private;
}
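
/* Typical use from an eventdev op callback (illustrative only):
 *
 *   struct dsw_evdev *dsw = dsw_pmd_priv(dev);
 */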

#define DSW_LOG_DP(level, fmt, args...)					\
	RTE_LOG_DP(level, EVENTDEV, "[%s] %s() line %u: " fmt,		\
		   DSW_PMD_NAME,					\
		   __func__, __LINE__, ## args)

#define DSW_LOG_DP_PORT(level, port_id, fmt, args...)		\
	DSW_LOG_DP(level, "<Port %d> " fmt, port_id, ## args)
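
/* Example use of the logging macros above (illustrative; the format
 * string and arguments are made up):
 *
 *   DSW_LOG_DP_PORT(DEBUG, port->id, "Emigrating %d flows.\n",
 *                   num_flows);
 */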

#endif