summaryrefslogtreecommitdiff
path: root/lib/librte_power
diff options
context:
space:
mode:
Diffstat (limited to 'lib/librte_power')
-rw-r--r--lib/librte_power/Makefile2
-rw-r--r--lib/librte_power/meson.build3
-rw-r--r--lib/librte_power/power_pstate_cpufreq.c771
-rw-r--r--lib/librte_power/power_pstate_cpufreq.h218
-rw-r--r--lib/librte_power/rte_power.c48
-rw-r--r--lib/librte_power/rte_power.h3
6 files changed, 1033 insertions, 12 deletions
diff --git a/lib/librte_power/Makefile b/lib/librte_power/Makefile
index 9bec668ddc..ab771528fe 100644
--- a/lib/librte_power/Makefile
+++ b/lib/librte_power/Makefile
@@ -6,6 +6,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
# library name
LIB = librte_power.a
+CFLAGS += -DALLOW_EXPERIMENTAL_API
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
LDLIBS += -lrte_eal -lrte_timer
@@ -17,6 +18,7 @@ LIBABIVER := 1
SRCS-$(CONFIG_RTE_LIBRTE_POWER) := rte_power.c power_acpi_cpufreq.c
SRCS-$(CONFIG_RTE_LIBRTE_POWER) += power_kvm_vm.c guest_channel.c
SRCS-$(CONFIG_RTE_LIBRTE_POWER) += rte_power_empty_poll.c
+SRCS-$(CONFIG_RTE_LIBRTE_POWER) += power_pstate_cpufreq.c
# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_POWER)-include := rte_power.h rte_power_empty_poll.h
diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
index 9ed8b56da4..02e0337ced 100644
--- a/lib/librte_power/meson.build
+++ b/lib/librte_power/meson.build
@@ -6,6 +6,7 @@ if host_machine.system() != 'linux'
endif
sources = files('rte_power.c', 'power_acpi_cpufreq.c',
'power_kvm_vm.c', 'guest_channel.c',
- 'rte_power_empty_poll.c')
+ 'rte_power_empty_poll.c',
+ 'power_pstate_cpufreq.c')
headers = files('rte_power.h','rte_power_empty_poll.h')
deps += ['timer']
diff --git a/lib/librte_power/power_pstate_cpufreq.c b/lib/librte_power/power_pstate_cpufreq.c
new file mode 100644
index 0000000000..411d0eb163
--- /dev/null
+++ b/lib/librte_power/power_pstate_cpufreq.c
@@ -0,0 +1,771 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <limits.h>
+#include <errno.h>
+#include <inttypes.h>
+
+#include <rte_memcpy.h>
+#include <rte_atomic.h>
+
+#include "power_pstate_cpufreq.h"
+#include "power_common.h"
+
+
+/*
+ * Debug tracing: when RTE_LIBRTE_POWER_DEBUG is defined the message is sent
+ * through RTE_LOG prefixed with the name of the calling function; otherwise
+ * the macro expands to nothing.
+ */
+#ifdef RTE_LIBRTE_POWER_DEBUG
+#define POWER_DEBUG_TRACE(fmt, args...) do { \
+ RTE_LOG(ERR, POWER, "%s: " fmt, __func__, ## args); \
+} while (0)
+#else
+#define POWER_DEBUG_TRACE(fmt, args...)
+#endif
+
+/*
+ * If the stream f is NULL, log an error and return retval from the enclosing
+ * function. Nothing is released before returning, so any resource acquired
+ * earlier by the caller must be freed by the caller itself.
+ */
+#define FOPEN_OR_ERR_RET(f, retval) do { \
+ if ((f) == NULL) { \
+ RTE_LOG(ERR, POWER, "File not openned\n"); \
+ return retval; \
+ } \
+} while (0)
+
+/*
+ * If ret (the pointer returned by fgets) is NULL, log an error and jump to
+ * label in the enclosing function.
+ */
+#define FOPS_OR_NULL_GOTO(ret, label) do { \
+ if ((ret) == NULL) { \
+ RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \
+ goto label; \
+ } \
+} while (0)
+
+/*
+ * If ret (the integer result of a stdio call) is negative, log an error and
+ * jump to label in the enclosing function.
+ */
+#define FOPS_OR_ERR_GOTO(ret, label) do { \
+ if ((ret) < 0) { \
+ RTE_LOG(ERR, POWER, "File operations failed\n"); \
+ goto label; \
+ } \
+} while (0)
+
+
+/* Numeric base handed to strtoul() when parsing the sysfs frequency files */
+#define POWER_CONVERT_TO_DECIMAL 10
+/*
+ * Multiplier applied to the non-turbo ratio to obtain the base max frequency,
+ * and the step between two consecutive frequency buckets.
+ * NOTE(review): presumably expressed in kHz like the sysfs values it is
+ * compared against (i.e. a 100 MHz bus clock) - confirm.
+ */
+#define BUS_FREQ 100000
+
+/* Governor name written to scaling_governor during initialisation */
+#define POWER_GOVERNOR_PERF "performance"
+/* Per-CPU cpufreq sysfs files; %u is replaced by the lcore id */
+#define POWER_SYSFILE_GOVERNOR \
+ "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor"
+#define POWER_SYSFILE_MAX_FREQ \
+ "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq"
+#define POWER_SYSFILE_MIN_FREQ \
+ "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_min_freq"
+#define POWER_SYSFILE_CUR_FREQ \
+ "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq"
+#define POWER_SYSFILE_BASE_MAX_FREQ \
+ "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_max_freq"
+#define POWER_SYSFILE_BASE_MIN_FREQ \
+ "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
+/* Per-CPU MSR character device; %u is replaced by the lcore id */
+#define POWER_MSR_PATH "/dev/cpu/%u/msr"
+
+/*
+ * MSR related: PLATFORM_INFO is the register number used as the read offset
+ * in the MSR device; the non-turbo max ratio is extracted from its value as
+ * (value & NON_TURBO_MASK) >> NON_TURBO_OFFSET, i.e. bits 15:8.
+ */
+#define PLATFORM_INFO 0x0CE
+#define NON_TURBO_MASK 0xFF00
+#define NON_TURBO_OFFSET 0x8
+
+
+/*
+ * Per-lcore life-cycle state stored in pstate_power_info.state.
+ * init moves IDLE -> ONGOING -> USED, exit moves USED -> ONGOING -> IDLE;
+ * either one leaves the lcore in UNKNOWN when it fails half-way.
+ */
+enum power_state {
+ POWER_IDLE = 0,
+ POWER_ONGOING,
+ POWER_USED,
+ POWER_UNKNOWN
+};
+
+/*
+ * Per-lcore bookkeeping for the pstate backend: the generated frequency
+ * table, the open sysfs streams used to change the frequency, the governor
+ * to restore on exit and the turbo flags.
+ */
+struct pstate_power_info {
+ unsigned int lcore_id; /**< Logical core id */
+ uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array, high to low */
+ uint32_t nb_freqs; /**< number of available freqs */
+ FILE *f_cur_min; /**< Stream opened on scaling_min_freq */
+ FILE *f_cur_max; /**< Stream opened on scaling_max_freq */
+ char governor_ori[32]; /**< Original governor name */
+ uint32_t curr_idx; /**< Freq index in freqs array */
+ uint32_t non_turbo_max_ratio; /**< Non Turbo Max ratio */
+ uint32_t sys_max_freq; /**< system wide max freq */
+ volatile uint32_t state; /**< Power in use state */
+ uint16_t turbo_available; /**< Turbo Boost available */
+ uint16_t turbo_enable; /**< Turbo Boost enable/disable */
+} __rte_cache_aligned;
+
+
+/* One entry per possible lcore, indexed by lcore id */
+static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
+
+/**
+ * Read one 64-bit model-specific register of an lcore through its
+ * POWER_MSR_PATH character device.
+ *
+ * @param msr
+ *  Register number; it is used as the file offset of the read.
+ * @param val
+ *  Where the register content is stored on success.
+ * @param lcore_id
+ *  lcore whose MSR device is opened.
+ *
+ * @return
+ *  - Number of bytes read (sizeof(uint64_t)) on success.
+ *  - Negative if the device cannot be opened or the register cannot be
+ *    read completely.
+ */
+static int32_t
+power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
+{
+	int fd, ret = -1;
+	ssize_t nread;
+	char fullpath[PATH_MAX];
+
+	snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
+
+	fd = open(fullpath, O_RDONLY);
+	if (fd < 0) {
+		RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath,
+				strerror(errno));
+		return fd;
+	}
+
+	nread = pread(fd, val, sizeof(uint64_t), msr);
+	if (nread < 0) {
+		RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath,
+				strerror(errno));
+		goto out;
+	}
+
+	/* A partial read leaves *val only partly written: treat as failure */
+	if ((size_t)nread != sizeof(uint64_t)) {
+		RTE_LOG(ERR, POWER, "Short read from '%s': %zd of %zu bytes\n",
+				fullpath, nread, sizeof(uint64_t));
+		goto out;
+	}
+
+	ret = (int)nread;
+
+	POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
+			fullpath, msr, lcore_id);
+
+	POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val);
+
+out:
+	close(fd);
+	return ret;
+}
+
+/**
+ * Open the scaling_min_freq and scaling_max_freq sysfs files of the lcore for
+ * later frequency changes and read the non-turbo max ratio from the
+ * PLATFORM_INFO MSR.
+ *
+ * On success the two streams are stored in pi->f_cur_min / pi->f_cur_max and
+ * the ratio in pi->non_turbo_max_ratio. On failure every stream opened here
+ * is closed again and pi is left untouched.
+ *
+ * @param pi
+ *  Per-lcore power info; pi->lcore_id must already be set.
+ *
+ * @return
+ *  0 on success, -1 on failure.
+ */
+static int
+power_init_for_setting_freq(struct pstate_power_info *pi)
+{
+	FILE *f_min, *f_max;
+	char fullpath_min[PATH_MAX];
+	char fullpath_max[PATH_MAX];
+	uint64_t max_non_turbo = 0;
+
+	snprintf(fullpath_min, sizeof(fullpath_min), POWER_SYSFILE_MIN_FREQ,
+			pi->lcore_id);
+
+	/* "r+" is the standard spelling of read/write without truncation */
+	f_min = fopen(fullpath_min, "r+");
+	FOPEN_OR_ERR_RET(f_min, -1);
+
+	snprintf(fullpath_max, sizeof(fullpath_max), POWER_SYSFILE_MAX_FREQ,
+			pi->lcore_id);
+
+	f_max = fopen(fullpath_max, "r+");
+	if (f_max == NULL) {
+		/* Do not leak the first stream when the second open fails */
+		fclose(f_min);
+	}
+	FOPEN_OR_ERR_RET(f_max, -1);
+
+	/* Add MSR read to detect turbo status */
+	if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0) {
+		fclose(f_min);
+		fclose(f_max);
+		return -1;
+	}
+
+	max_non_turbo = (max_non_turbo & NON_TURBO_MASK) >> NON_TURBO_OFFSET;
+
+	POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo);
+
+	/* Publish the results only once every step has succeeded */
+	pi->f_cur_min = f_min;
+	pi->f_cur_max = f_max;
+	pi->non_turbo_max_ratio = max_non_turbo;
+
+	return 0;
+}
+
+/**
+ * Switch the lcore to the frequency bucket idx by writing the same target
+ * value to both scaling_min_freq and scaling_max_freq.
+ *
+ * When lowering the frequency the min file is written first, when raising it
+ * the max file is written first.
+ *
+ * @param pi
+ *  Per-lcore power info with both sysfs streams already open.
+ * @param idx
+ *  Index into pi->freqs (0 is the highest frequency).
+ *
+ * @return
+ *  - 1 if the frequency was changed.
+ *  - 0 if idx is already the current index.
+ *  - -1 on invalid index or I/O failure.
+ */
+static int
+set_freq_internal(struct pstate_power_info *pi, uint32_t idx)
+{
+	uint32_t target_freq = 0;
+	FILE *first, *second;
+
+	if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) {
+		RTE_LOG(ERR, POWER, "Invalid frequency index %u, which "
+				"should be less than %u\n", idx, pi->nb_freqs);
+		return -1;
+	}
+
+	/* Check if it is the same as current */
+	if (idx == pi->curr_idx)
+		return 0;
+
+	/* Because Intel Pstate Driver only allow user change min/max hint
+	 * User need change the min/max as same value.
+	 */
+	if (fseek(pi->f_cur_min, 0, SEEK_SET) < 0) {
+		RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
+				"for setting frequency for lcore %u\n",
+				pi->lcore_id);
+		return -1;
+	}
+
+	if (fseek(pi->f_cur_max, 0, SEEK_SET) < 0) {
+		RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
+				"for setting frequency for lcore %u\n",
+				pi->lcore_id);
+		return -1;
+	}
+
+	/* Turbo is available and enabled, first freq bucket is sys max freq */
+	if (pi->turbo_available && pi->turbo_enable && (idx == 0))
+		target_freq = pi->sys_max_freq;
+	else
+		target_freq = pi->freqs[idx];
+
+	if (idx > pi->curr_idx) {
+		/* Decrease freq, the min freq should be updated first */
+		first = pi->f_cur_min;
+		second = pi->f_cur_max;
+	} else {
+		/* Increase freq, the max freq should be updated first */
+		first = pi->f_cur_max;
+		second = pi->f_cur_min;
+	}
+
+	if (fprintf(first, "%u", target_freq) < 0 ||
+			fprintf(second, "%u", target_freq) < 0) {
+		RTE_LOG(ERR, POWER, "Fail to write new frequency for "
+				"lcore %u\n", pi->lcore_id);
+		return -1;
+	}
+
+	POWER_DEBUG_TRACE("Freqency '%u' to be set for lcore %u\n",
+			target_freq, pi->lcore_id);
+
+	/* The data only reaches sysfs when the stream is flushed, so this is
+	 * where a rejected write is actually reported.
+	 */
+	if (fflush(first) != 0 || fflush(second) != 0) {
+		RTE_LOG(ERR, POWER, "Fail to write new frequency for "
+				"lcore %u\n", pi->lcore_id);
+		return -1;
+	}
+
+	pi->curr_idx = idx;
+
+	return 1;
+}
+
+/**
+ * Read the current scaling governor of the lcore from sysfs, remember it in
+ * pi->governor_ori for a later roll back and, if it is not already
+ * 'performance', write 'performance' to the sysfs file.
+ *
+ * @param pi
+ *  Per-lcore power info; pi->lcore_id must already be set.
+ *
+ * @return
+ *  0 on success, -1 on failure.
+ */
+static int
+power_set_governor_performance(struct pstate_power_info *pi)
+{
+	FILE *f;
+	int ret = -1;
+	char buf[BUFSIZ];
+	char fullpath[PATH_MAX];
+	char *s;
+	int val;
+
+	snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR,
+			pi->lcore_id);
+	/* "r+" is the standard spelling of read/write without truncation */
+	f = fopen(fullpath, "r+");
+	FOPEN_OR_ERR_RET(f, ret);
+
+	s = fgets(buf, sizeof(buf), f);
+	FOPS_OR_NULL_GOTO(s, out);
+
+	/* fgets keeps the line break; drop it so that the name compares
+	 * equal to the governor constant and is stored clean.
+	 */
+	buf[strcspn(buf, "\n")] = '\0';
+
+	/* Save the original governor, even when nothing has to be changed,
+	 * so that a later restore never uses a stale name.
+	 */
+	snprintf(pi->governor_ori, sizeof(pi->governor_ori), "%s", buf);
+
+	/* Check if current governor is performance */
+	if (strcmp(buf, POWER_GOVERNOR_PERF) == 0) {
+		ret = 0;
+		POWER_DEBUG_TRACE("Power management governor of lcore %u is "
+				"already performance\n", pi->lcore_id);
+		goto out;
+	}
+
+	/* Write 'performance' to the governor */
+	val = fseek(f, 0, SEEK_SET);
+	FOPS_OR_ERR_GOTO(val, out);
+
+	val = fputs(POWER_GOVERNOR_PERF, f);
+	FOPS_OR_ERR_GOTO(val, out);
+
+	/* The write only reaches sysfs on flush; fflush returns EOF (< 0)
+	 * when the kernel rejects it.
+	 */
+	val = fflush(f);
+	FOPS_OR_ERR_GOTO(val, out);
+
+	ret = 0;
+	RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been "
+			"set to performance successfully\n", pi->lcore_id);
+out:
+	fclose(f);
+
+	return ret;
+}
+
+/**
+ * Restore the scaling governor saved in pi->governor_ori: read the current
+ * governor from sysfs and write the saved one back if they differ.
+ *
+ * @param pi
+ *  Per-lcore power info holding the saved governor name.
+ *
+ * @return
+ *  0 on success, -1 on failure.
+ */
+static int
+power_set_governor_original(struct pstate_power_info *pi)
+{
+	FILE *f;
+	int ret = -1;
+	char buf[BUFSIZ];
+	char governor[sizeof(pi->governor_ori)];
+	char fullpath[PATH_MAX];
+	char *s;
+	int val;
+
+	snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR,
+			pi->lcore_id);
+	/* "r+" is the standard spelling of read/write without truncation */
+	f = fopen(fullpath, "r+");
+	FOPEN_OR_ERR_RET(f, ret);
+
+	s = fgets(buf, sizeof(buf), f);
+	FOPS_OR_NULL_GOTO(s, out);
+
+	/* Compare the bare names: neither the line read from sysfs nor the
+	 * saved name may carry a trailing line break into the comparison.
+	 */
+	buf[strcspn(buf, "\n")] = '\0';
+	snprintf(governor, sizeof(governor), "%s", pi->governor_ori);
+	governor[strcspn(governor, "\n")] = '\0';
+
+	/* Check if the governor to be set is the same as current */
+	if (strcmp(buf, governor) == 0) {
+		ret = 0;
+		POWER_DEBUG_TRACE("Power management governor of lcore %u "
+				"has already been set to %s\n",
+				pi->lcore_id, governor);
+		goto out;
+	}
+
+	/* Write back the original governor */
+	val = fseek(f, 0, SEEK_SET);
+	FOPS_OR_ERR_GOTO(val, out);
+
+	val = fputs(governor, f);
+	FOPS_OR_ERR_GOTO(val, out);
+
+	/* The write only reaches sysfs on flush; fflush returns EOF (< 0)
+	 * when the kernel rejects it.
+	 */
+	val = fflush(f);
+	FOPS_OR_ERR_GOTO(val, out);
+
+	ret = 0;
+	RTE_LOG(INFO, POWER, "Power management governor of lcore %u "
+			"has been set back to %s successfully\n",
+			pi->lcore_id, governor);
+out:
+	fclose(f);
+
+	return ret;
+}
+
+/**
+ * Build the frequency table of the lcore.
+ *
+ * cpuinfo_min_freq and cpuinfo_max_freq are read from sysfs; the base
+ * (non-turbo) max frequency is pi->non_turbo_max_ratio * BUS_FREQ. The table
+ * runs from the base max down to the system min in BUS_FREQ steps, preceded
+ * by one extra turbo bucket when the system max exceeds the base max.
+ *
+ * pi->non_turbo_max_ratio must have been filled in beforehand.
+ *
+ * @param pi
+ *  Per-lcore power info; freqs, nb_freqs, sys_max_freq and turbo_available
+ *  are updated.
+ *
+ * @return
+ *  0 on success, -1 on failure.
+ */
+static int
+power_get_available_freqs(struct pstate_power_info *pi)
+{
+	FILE *f_min, *f_max;
+	int ret = -1;
+	char *p_min, *p_max;
+	char buf_min[BUFSIZ];
+	char buf_max[BUFSIZ];
+	char fullpath_min[PATH_MAX];
+	char fullpath_max[PATH_MAX];
+	char *s_min, *s_max;
+	uint32_t sys_min_freq = 0, sys_max_freq = 0, base_max_freq = 0;
+	uint32_t i, num_freqs = 0;
+
+	snprintf(fullpath_max, sizeof(fullpath_max),
+			POWER_SYSFILE_BASE_MAX_FREQ,
+			pi->lcore_id);
+	snprintf(fullpath_min, sizeof(fullpath_min),
+			POWER_SYSFILE_BASE_MIN_FREQ,
+			pi->lcore_id);
+
+	f_min = fopen(fullpath_min, "r");
+	FOPEN_OR_ERR_RET(f_min, ret);
+
+	f_max = fopen(fullpath_max, "r");
+	if (f_max == NULL) {
+		/* Do not leak the first stream when the second open fails */
+		fclose(f_min);
+	}
+	FOPEN_OR_ERR_RET(f_max, ret);
+
+	s_min = fgets(buf_min, sizeof(buf_min), f_min);
+	FOPS_OR_NULL_GOTO(s_min, out);
+
+	s_max = fgets(buf_max, sizeof(buf_max), f_max);
+	FOPS_OR_NULL_GOTO(s_max, out);
+
+	/* strtoul stops at the trailing line break by itself */
+	sys_min_freq = strtoul(buf_min, &p_min, POWER_CONVERT_TO_DECIMAL);
+	sys_max_freq = strtoul(buf_max, &p_max, POWER_CONVERT_TO_DECIMAL);
+
+	/* No digits consumed means the sysfs content was not a number */
+	if (p_min == buf_min || p_max == buf_max) {
+		RTE_LOG(ERR, POWER, "Cannot parse min/max frequency of "
+				"lcore %u\n", pi->lcore_id);
+		goto out;
+	}
+
+	if (sys_max_freq < sys_min_freq)
+		goto out;
+
+	pi->sys_max_freq = sys_max_freq;
+
+	base_max_freq = pi->non_turbo_max_ratio * BUS_FREQ;
+
+	POWER_DEBUG_TRACE("sys min %u, sys max %u, base_max %u\n",
+			sys_min_freq,
+			sys_max_freq,
+			base_max_freq);
+
+	/* The bucket count below is computed with unsigned arithmetic and
+	 * would wrap around if the base max were below the system min.
+	 */
+	if (base_max_freq < sys_min_freq) {
+		RTE_LOG(ERR, POWER, "Base max frequency %u of lcore %u is "
+				"below the min frequency %u\n",
+				base_max_freq, pi->lcore_id, sys_min_freq);
+		goto out;
+	}
+
+	if (base_max_freq < sys_max_freq)
+		pi->turbo_available = 1;
+	else
+		pi->turbo_available = 0;
+
+	/* If turbo is available then there is one extra freq bucket
+	 * to store the sys max freq which value is base_max +1
+	 */
+	num_freqs = (base_max_freq - sys_min_freq) / BUS_FREQ + 1 +
+		pi->turbo_available;
+
+	/* pi->freqs only has room for RTE_MAX_LCORE_FREQS entries */
+	if (num_freqs > RTE_MAX_LCORE_FREQS) {
+		RTE_LOG(ERR, POWER, "Too many available frequencies (%u) "
+				"for lcore %u\n", num_freqs, pi->lcore_id);
+		goto out;
+	}
+
+	/* Generate the freq bucket array.
+	 * If turbo is available the freq bucket[0] value is base_max +1
+	 * the bucket[1] is base_max, bucket[2] is base_max - BUS_FREQ
+	 * and so on.
+	 * If turbo is not available bucket[0] is base_max and so on
+	 */
+	for (i = 0, pi->nb_freqs = 0; i < num_freqs; i++) {
+		if ((i == 0) && pi->turbo_available)
+			pi->freqs[pi->nb_freqs++] = base_max_freq + 1;
+		else
+			pi->freqs[pi->nb_freqs++] =
+			base_max_freq - (i - pi->turbo_available) * BUS_FREQ;
+	}
+
+	ret = 0;
+
+	POWER_DEBUG_TRACE("%u frequency(s) of lcore %u are available\n",
+			num_freqs, pi->lcore_id);
+
+out:
+	fclose(f_min);
+	fclose(f_max);
+
+	return ret;
+}
+
+/*
+ * Bring up pstate power management for one lcore: claim the per-lcore slot,
+ * switch the governor to performance, open the scaling files, build the
+ * frequency table and finally request the max frequency. The slot ends up
+ * in POWER_USED on success and in POWER_UNKNOWN if any step fails.
+ * Returns 0 on success, -1 otherwise.
+ */
+int
+power_pstate_cpufreq_init(unsigned int lcore_id)
+{
+	struct pstate_power_info *info;
+
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Lcore id %u can not exceed %u\n",
+				lcore_id, RTE_MAX_LCORE - 1U);
+		return -1;
+	}
+
+	info = &lcore_power_info[lcore_id];
+
+	/* Only an idle slot may be initialised */
+	if (!rte_atomic32_cmpset(&(info->state), POWER_IDLE, POWER_ONGOING)) {
+		RTE_LOG(INFO, POWER, "Power management of lcore %u is "
+				"in use\n", lcore_id);
+		return -1;
+	}
+
+	info->lcore_id = lcore_id;
+
+	/* The steps run in order; the first one that fails stops the chain */
+	if (power_set_governor_performance(info) < 0) {
+		RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to "
+				"performance\n", lcore_id);
+	} else if (power_init_for_setting_freq(info) < 0) {
+		RTE_LOG(ERR, POWER, "Cannot init for setting frequency for "
+				"lcore %u\n", lcore_id);
+	} else if (power_get_available_freqs(info) < 0) {
+		RTE_LOG(ERR, POWER, "Cannot get available frequencies of "
+				"lcore %u\n", lcore_id);
+	} else if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
+		RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u "
+				"to max\n", lcore_id);
+	} else {
+		RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u "
+				"power management\n", lcore_id);
+		rte_atomic32_cmpset(&(info->state), POWER_ONGOING, POWER_USED);
+		return 0;
+	}
+
+	/* Some step failed: mark the slot as unusable */
+	rte_atomic32_cmpset(&(info->state), POWER_ONGOING, POWER_UNKNOWN);
+	return -1;
+}
+
+/*
+ * Tear down pstate power management for one lcore: close the two scaling
+ * file streams and put the saved governor back. The slot returns to
+ * POWER_IDLE on success and is marked POWER_UNKNOWN if the governor cannot
+ * be restored. Returns 0 on success, -1 otherwise.
+ */
+int
+power_pstate_cpufreq_exit(unsigned int lcore_id)
+{
+	struct pstate_power_info *info;
+
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n",
+				lcore_id, RTE_MAX_LCORE - 1U);
+		return -1;
+	}
+
+	info = &lcore_power_info[lcore_id];
+
+	/* Only a slot that is in use can be shut down */
+	if (!rte_atomic32_cmpset(&(info->state), POWER_USED, POWER_ONGOING)) {
+		RTE_LOG(INFO, POWER, "Power management of lcore %u is "
+				"not used\n", lcore_id);
+		return -1;
+	}
+
+	/* Release the streams used for setting the frequency */
+	fclose(info->f_cur_min);
+	info->f_cur_min = NULL;
+	fclose(info->f_cur_max);
+	info->f_cur_max = NULL;
+
+	/* Put the governor saved at init time back in place */
+	if (power_set_governor_original(info) < 0) {
+		RTE_LOG(ERR, POWER, "Cannot set the governor of %u back "
+				"to the original\n", lcore_id);
+		rte_atomic32_cmpset(&(info->state), POWER_ONGOING,
+				POWER_UNKNOWN);
+		return -1;
+	}
+
+	RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from "
+			"'performance' mode and been set back to the "
+			"original\n", lcore_id);
+	rte_atomic32_cmpset(&(info->state), POWER_ONGOING, POWER_IDLE);
+
+	return 0;
+}
+
+
+/*
+ * Copy the frequency table of an lcore into the caller's buffer.
+ *
+ * freqs must have room for num entries. Returns the number of frequencies
+ * copied, or 0 when the lcore id is invalid, freqs is NULL or the buffer is
+ * too small to hold the whole table (nothing is copied in those cases).
+ */
+uint32_t
+power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num)
+{
+	struct pstate_power_info *pi;
+
+	/* The return type is an unsigned count, so errors are reported as 0
+	 * rather than -1, which callers would read as a huge count.
+	 */
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+		return 0;
+	}
+
+	if (freqs == NULL) {
+		RTE_LOG(ERR, POWER, "NULL buffer supplied\n");
+		return 0;
+	}
+
+	pi = &lcore_power_info[lcore_id];
+	if (num < pi->nb_freqs) {
+		RTE_LOG(ERR, POWER, "Buffer size is not enough\n");
+		return 0;
+	}
+	rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t));
+
+	return pi->nb_freqs;
+}
+
+/*
+ * Report the index of the frequency bucket currently selected for an lcore,
+ * or RTE_POWER_INVALID_FREQ_INDEX when the lcore id is out of range.
+ */
+uint32_t
+power_pstate_cpufreq_get_freq(unsigned int lcore_id)
+{
+	const struct pstate_power_info *info;
+
+	if (lcore_id < RTE_MAX_LCORE) {
+		info = &lcore_power_info[lcore_id];
+		return info->curr_idx;
+	}
+
+	RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+	return RTE_POWER_INVALID_FREQ_INDEX;
+}
+
+
+/*
+ * Select the frequency bucket 'index' for an lcore. Returns the result of
+ * set_freq_internal() (1 changed, 0 unchanged, negative on error), or -1
+ * when the lcore id is out of range.
+ */
+int
+power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index)
+{
+	struct pstate_power_info *info;
+
+	if (lcore_id < RTE_MAX_LCORE) {
+		info = &lcore_power_info[lcore_id];
+		return set_freq_internal(info, index);
+	}
+
+	RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+	return -1;
+}
+
+/*
+ * Move an lcore one bucket up in the frequency table.
+ *
+ * Nothing is done (0 is returned) when the lcore already runs at the highest
+ * bucket it may use: index 0, or index 1 when the turbo bucket exists but
+ * turbo is disabled. Otherwise the result of set_freq_internal() is
+ * returned; -1 is returned for an invalid lcore id.
+ */
+int
+power_pstate_cpufreq_freq_up(unsigned int lcore_id)
+{
+	struct pstate_power_info *pi;
+
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+		return -1;
+	}
+
+	pi = &lcore_power_info[lcore_id];
+
+	/* Index 0 is the turbo bucket when turbo is available: do not step
+	 * into it while turbo is disabled, in line with the max frequency
+	 * selection which then stops at index 1.
+	 */
+	if (pi->curr_idx == 0 ||
+			(pi->curr_idx == 1 && pi->turbo_available &&
+			 !pi->turbo_enable))
+		return 0;
+
+	/* Frequencies in the array are from high to low. */
+	return set_freq_internal(pi, pi->curr_idx - 1);
+}
+
+/*
+ * Move an lcore one bucket down in the frequency table. Returns 0 without
+ * doing anything when the current index is already the last one, -1 for an
+ * invalid lcore id, and the result of set_freq_internal() otherwise.
+ */
+int
+power_pstate_cpufreq_freq_down(unsigned int lcore_id)
+{
+	struct pstate_power_info *info;
+	uint32_t next_idx;
+
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+		return -1;
+	}
+
+	info = &lcore_power_info[lcore_id];
+
+	/* The table is sorted high to low, so "down" is the next index */
+	next_idx = info->curr_idx + 1;
+	if (next_idx == info->nb_freqs)
+		return 0;
+
+	return set_freq_internal(info, next_idx);
+}
+
+/*
+ * Switch an lcore to the highest frequency it may use: bucket 0 normally,
+ * bucket 1 (max non-turbo) when the turbo bucket exists but turbo is
+ * disabled. Returns the result of set_freq_internal(), or -1 for an invalid
+ * lcore id.
+ */
+int
+power_pstate_cpufreq_freq_max(unsigned int lcore_id)
+{
+	struct pstate_power_info *info;
+	uint32_t top_idx;
+
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+		return -1;
+	}
+
+	info = &lcore_power_info[lcore_id];
+
+	/* Frequencies in the array are from high to low; skip the turbo
+	 * bucket only if it is present and turbo is switched off.
+	 */
+	top_idx = (info->turbo_available && !info->turbo_enable) ? 1 : 0;
+
+	return set_freq_internal(info, top_idx);
+}
+
+
+/*
+ * Switch an lcore to the last (lowest) bucket of its frequency table.
+ * Returns the result of set_freq_internal(), or -1 for an invalid lcore id.
+ */
+int
+power_pstate_cpufreq_freq_min(unsigned int lcore_id)
+{
+	struct pstate_power_info *info;
+	uint32_t last_idx;
+
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+		return -1;
+	}
+
+	info = &lcore_power_info[lcore_id];
+
+	/* The table is sorted high to low, so the minimum is the last entry */
+	last_idx = info->nb_freqs - 1;
+
+	return set_freq_internal(info, last_idx);
+}
+
+
+/*
+ * Report the turbo_enable flag of an lcore, or -1 when the lcore id is out
+ * of range.
+ */
+int
+power_pstate_turbo_status(unsigned int lcore_id)
+{
+	if (lcore_id < RTE_MAX_LCORE)
+		return lcore_power_info[lcore_id].turbo_enable;
+
+	RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+	return -1;
+}
+
+/*
+ * Set the turbo_enable flag of an lcore. This only succeeds when turbo was
+ * detected as available; otherwise the flag is cleared and -1 is returned.
+ * -1 is also returned for an invalid lcore id.
+ */
+int
+power_pstate_enable_turbo(unsigned int lcore_id)
+{
+	struct pstate_power_info *info;
+
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+		return -1;
+	}
+
+	info = &lcore_power_info[lcore_id];
+
+	if (!info->turbo_available) {
+		info->turbo_enable = 0;
+		RTE_LOG(ERR, POWER,
+			"Failed to enable turbo on lcore %u\n",
+			lcore_id);
+		return -1;
+	}
+
+	info->turbo_enable = 1;
+	return 0;
+}
+
+
+/*
+ * Clear the turbo_enable flag of an lcore. Returns 0, or -1 when the lcore
+ * id is out of range.
+ */
+int
+power_pstate_disable_turbo(unsigned int lcore_id)
+{
+	if (lcore_id < RTE_MAX_LCORE) {
+		lcore_power_info[lcore_id].turbo_enable = 0;
+		return 0;
+	}
+
+	RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+	return -1;
+}
+
+
+/*
+ * Fill caps for an lcore: every capability bit is cleared and the turbo bit
+ * is then set from the turbo_available flag. Returns 0 on success, -1 when
+ * the lcore id is out of range or caps is NULL.
+ */
+int power_pstate_get_capabilities(unsigned int lcore_id,
+		struct rte_power_core_capabilities *caps)
+{
+	const struct pstate_power_info *info;
+
+	if (lcore_id >= RTE_MAX_LCORE) {
+		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
+		return -1;
+	}
+
+	if (caps == NULL) {
+		RTE_LOG(ERR, POWER, "Invalid argument\n");
+		return -1;
+	}
+
+	info = &lcore_power_info[lcore_id];
+
+	/* Reset everything first, then report turbo as a plain 0/1 */
+	caps->capabilities = 0;
+	caps->turbo = info->turbo_available ? 1 : 0;
+
+	return 0;
+}
diff --git a/lib/librte_power/power_pstate_cpufreq.h b/lib/librte_power/power_pstate_cpufreq.h
new file mode 100644
index 0000000000..6fd801881f
--- /dev/null
+++ b/lib/librte_power/power_pstate_cpufreq.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _POWER_PSTATE_CPUFREQ_H
+#define _POWER_PSTATE_CPUFREQ_H
+
+/**
+ * @file
+ * RTE Power Management via Intel Pstate driver
+ */
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include "rte_power.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize power management for a specific lcore. It will check and set the
+ * governor to performance for the lcore, get the available frequencies, and
+ * prepare to set new lcore frequency.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 0 on success.
+ * - Negative on error.
+ */
+int power_pstate_cpufreq_init(unsigned int lcore_id);
+
+/**
+ * Exit power management on a specific lcore. It will restore the governor that
+ * was in effect before initialization.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 0 on success.
+ * - Negative on error.
+ */
+int power_pstate_cpufreq_exit(unsigned int lcore_id);
+
+/**
+ * Get the available frequencies of a specific lcore. On success the return
+ * value is the total number of available frequencies; if the supplied buffer
+ * holds fewer entries than that, nothing is copied and 0 is returned. The
+ * frequency index used by the other interfaces must be in the range of 0 to
+ * this return value minus one.
+ * This function is not thread-safe; callers must provide their own locking.
+ *
+ * @param lcore_id
+ * lcore id.
+ * @param freqs
+ * The buffer array to save the frequencies.
+ * @param num
+ * The number of frequencies to get.
+ *
+ * @return
+ * The number of available frequencies.
+ */
+uint32_t power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs,
+ uint32_t num);
+
+/**
+ * Return the current index of available frequencies of a specific lcore.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * The current index of available frequencies.
+ * If error, it will return 'RTE_POWER_INVALID_FREQ_INDEX = (~0)'.
+ */
+uint32_t power_pstate_cpufreq_get_freq(unsigned int lcore_id);
+
+/**
+ * Set the new frequency for a specific lcore by indicating the index of
+ * available frequencies.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ * @param index
+ * The index of available frequencies.
+ *
+ * @return
+ * - 1 on success with frequency changed.
+ * - 0 on success without frequency changed.
+ * - Negative on error.
+ */
+int power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index);
+
+/**
+ * Scale up the frequency of a specific lcore according to the available
+ * frequencies.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 1 on success with frequency changed.
+ * - 0 on success without frequency changed.
+ * - Negative on error.
+ */
+int power_pstate_cpufreq_freq_up(unsigned int lcore_id);
+
+/**
+ * Scale down the frequency of a specific lcore according to the available
+ * frequencies.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 1 on success with frequency changed.
+ * - 0 on success without frequency changed.
+ * - Negative on error.
+ */
+int power_pstate_cpufreq_freq_down(unsigned int lcore_id);
+
+/**
+ * Scale up the frequency of a specific lcore to the highest according to the
+ * available frequencies.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 1 on success with frequency changed.
+ * - 0 on success without frequency changed.
+ * - Negative on error.
+ */
+int power_pstate_cpufreq_freq_max(unsigned int lcore_id);
+
+/**
+ * Scale down the frequency of a specific lcore to the lowest according to the
+ * available frequencies.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 1 on success with frequency changed.
+ * - 0 on success without frequency changed.
+ * - Negative on error.
+ */
+int power_pstate_cpufreq_freq_min(unsigned int lcore_id);
+
+/**
+ * Get the turbo status of a specific lcore.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 1 Turbo Boost is enabled on this lcore.
+ * - 0 Turbo Boost is disabled on this lcore.
+ * - Negative on error.
+ */
+int power_pstate_turbo_status(unsigned int lcore_id);
+
+/**
+ * Enable Turbo Boost on a specific lcore.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 0 Turbo Boost is enabled successfully on this lcore.
+ * - Negative on error.
+ */
+int power_pstate_enable_turbo(unsigned int lcore_id);
+
+/**
+ * Disable Turbo Boost on a specific lcore.
+ * It should be protected outside of this function for threadsafe.
+ *
+ * @param lcore_id
+ * lcore id.
+ *
+ * @return
+ * - 0 Turbo Boost disabled successfully on this lcore.
+ * - Negative on error.
+ */
+int power_pstate_disable_turbo(unsigned int lcore_id);
+
+/**
+ * Returns power capabilities for a specific lcore.
+ *
+ * @param lcore_id
+ * lcore id.
+ * @param caps
+ * pointer to rte_power_core_capabilities object.
+ *
+ * @return
+ * - 0 on success.
+ * - Negative on error.
+ */
+int power_pstate_get_capabilities(unsigned int lcore_id,
+ struct rte_power_core_capabilities *caps);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/librte_power/rte_power.c b/lib/librte_power/rte_power.c
index 208b791950..a05fbef947 100644
--- a/lib/librte_power/rte_power.c
+++ b/lib/librte_power/rte_power.c
@@ -7,6 +7,7 @@
#include "rte_power.h"
#include "power_acpi_cpufreq.h"
#include "power_kvm_vm.h"
+#include "power_pstate_cpufreq.h"
#include "power_common.h"
enum power_management_env global_default_env = PM_ENV_NOT_SET;
@@ -56,6 +57,19 @@ rte_power_set_env(enum power_management_env env)
rte_power_freq_enable_turbo = power_kvm_vm_enable_turbo;
rte_power_freq_disable_turbo = power_kvm_vm_disable_turbo;
rte_power_get_capabilities = power_kvm_vm_get_capabilities;
+ } else if (env == PM_ENV_PSTATE_CPUFREQ) {
+ rte_power_freqs = power_pstate_cpufreq_freqs;
+ rte_power_get_freq = power_pstate_cpufreq_get_freq;
+ rte_power_set_freq = power_pstate_cpufreq_set_freq;
+ rte_power_freq_up = power_pstate_cpufreq_freq_up;
+ rte_power_freq_down = power_pstate_cpufreq_freq_down;
+ rte_power_freq_min = power_pstate_cpufreq_freq_min;
+ rte_power_freq_max = power_pstate_cpufreq_freq_max;
+ rte_power_turbo_status = power_pstate_turbo_status;
+ rte_power_freq_enable_turbo = power_pstate_enable_turbo;
+ rte_power_freq_disable_turbo = power_pstate_disable_turbo;
+ rte_power_get_capabilities = power_pstate_get_capabilities;
+
} else {
RTE_LOG(ERR, POWER, "Invalid Power Management Environment(%d) set\n",
env);
@@ -64,7 +78,6 @@ rte_power_set_env(enum power_management_env env)
}
global_default_env = env;
return 0;
-
}
void
@@ -84,21 +97,32 @@ rte_power_init(unsigned int lcore_id)
{
int ret = -1;
- if (global_default_env == PM_ENV_ACPI_CPUFREQ) {
+ switch (global_default_env) {
+ case PM_ENV_ACPI_CPUFREQ:
return power_acpi_cpufreq_init(lcore_id);
- }
- if (global_default_env == PM_ENV_KVM_VM) {
+ case PM_ENV_KVM_VM:
return power_kvm_vm_init(lcore_id);
+ case PM_ENV_PSTATE_CPUFREQ:
+ return power_pstate_cpufreq_init(lcore_id);
+ default:
+ RTE_LOG(INFO, POWER, "Env isn't set yet!\n");
}
+
/* Auto detect Environment */
- RTE_LOG(INFO, POWER, "Attempting to initialise ACPI cpufreq power "
- "management...\n");
+ RTE_LOG(INFO, POWER, "Attempting to initialise ACPI cpufreq power management...\n");
ret = power_acpi_cpufreq_init(lcore_id);
if (ret == 0) {
rte_power_set_env(PM_ENV_ACPI_CPUFREQ);
goto out;
}
+ RTE_LOG(INFO, POWER, "Attempting to initialise PSTAT power management...\n");
+ ret = power_pstate_cpufreq_init(lcore_id);
+ if (ret == 0) {
+ rte_power_set_env(PM_ENV_PSTATE_CPUFREQ);
+ goto out;
+ }
+
RTE_LOG(INFO, POWER, "Attempting to initialise VM power management...\n");
ret = power_kvm_vm_init(lcore_id);
if (ret == 0) {
@@ -114,13 +138,17 @@ out:
int
rte_power_exit(unsigned int lcore_id)
{
- if (global_default_env == PM_ENV_ACPI_CPUFREQ)
+ switch (global_default_env) {
+ case PM_ENV_ACPI_CPUFREQ:
return power_acpi_cpufreq_exit(lcore_id);
- if (global_default_env == PM_ENV_KVM_VM)
+ case PM_ENV_KVM_VM:
return power_kvm_vm_exit(lcore_id);
+ case PM_ENV_PSTATE_CPUFREQ:
+ return power_pstate_cpufreq_exit(lcore_id);
+ default:
+ RTE_LOG(ERR, POWER, "Environment has not been set, unable to exit gracefully\n");
- RTE_LOG(ERR, POWER, "Environment has not been set, unable to exit "
- "gracefully\n");
+ }
return -1;
}
diff --git a/lib/librte_power/rte_power.h b/lib/librte_power/rte_power.h
index d70bc0b332..c5e8f6b5bc 100644
--- a/lib/librte_power/rte_power.h
+++ b/lib/librte_power/rte_power.h
@@ -20,7 +20,8 @@ extern "C" {
#endif
/* Power Management Environment State */
-enum power_management_env {PM_ENV_NOT_SET, PM_ENV_ACPI_CPUFREQ, PM_ENV_KVM_VM};
+enum power_management_env {PM_ENV_NOT_SET, PM_ENV_ACPI_CPUFREQ, PM_ENV_KVM_VM,
+ PM_ENV_PSTATE_CPUFREQ};
/**
* Set the default power management implementation. If this is not called prior