Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 4
-rw-r--r--  kernel/auditfilter.c | 2
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/bpf/btf.c | 9
-rw-r--r--  kernel/bpf/cpumap.c | 6
-rw-r--r--  kernel/bpf/verifier.c | 41
-rw-r--r--  kernel/cgroup/Makefile | 1
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 17
-rw-r--r--  kernel/cgroup/cgroup.c | 68
-rw-r--r--  kernel/cgroup/cpuset-internal.h | 305
-rw-r--r--  kernel/cgroup/cpuset-v1.c | 562
-rw-r--r--  kernel/cgroup/cpuset.c | 1193
-rw-r--r--  kernel/cgroup/pids.c | 32
-rw-r--r--  kernel/context_tracking.c | 140
-rw-r--r--  kernel/cpu.c | 28
-rw-r--r--  kernel/crash_reserve.c | 3
-rw-r--r--  kernel/dma/debug.c | 5
-rw-r--r--  kernel/entry/common.c | 2
-rw-r--r--  kernel/events/core.c | 30
-rw-r--r--  kernel/events/internal.h | 1
-rw-r--r--  kernel/events/ring_buffer.c | 2
-rw-r--r--  kernel/events/uprobes.c | 3
-rw-r--r--  kernel/fork.c | 3
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/cpuhotplug.c | 4
-rw-r--r--  kernel/irq/irq_sim.c | 1
-rw-r--r--  kernel/irq/irqdesc.c | 1
-rw-r--r--  kernel/irq/irqdomain.c | 210
-rw-r--r--  kernel/irq/manage.c | 21
-rw-r--r--  kernel/irq/migration.c | 4
-rw-r--r--  kernel/irq/msi.c | 2
-rw-r--r--  kernel/irq/proc.c | 17
-rw-r--r--  kernel/jump_label.c | 4
-rw-r--r--  kernel/kallsyms.c | 55
-rw-r--r--  kernel/kallsyms_selftest.c | 22
-rw-r--r--  kernel/kcov.c | 46
-rw-r--r--  kernel/kexec_file.c | 2
-rw-r--r--  kernel/kprobes.c | 4
-rw-r--r--  kernel/ksysfs.c | 7
-rw-r--r--  kernel/locking/lockdep.c | 89
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 2
-rw-r--r--  kernel/locking/rtmutex.c | 9
-rw-r--r--  kernel/module/Makefile | 2
-rw-r--r--  kernel/module/main.c | 41
-rw-r--r--  kernel/padata.c | 13
-rw-r--r--  kernel/panic.c | 17
-rw-r--r--  kernel/power/hibernate.c | 26
-rw-r--r--  kernel/power/main.c | 76
-rw-r--r--  kernel/power/snapshot.c | 5
-rw-r--r--  kernel/printk/internal.h | 207
-rw-r--r--  kernel/printk/nbcon.c | 934
-rw-r--r--  kernel/printk/printk.c | 715
-rw-r--r--  kernel/printk/printk_ringbuffer.h | 7
-rw-r--r--  kernel/printk/printk_safe.c | 25
-rw-r--r--  kernel/profile.c | 242
-rw-r--r--  kernel/rcu/rcu.h | 12
-rw-r--r--  kernel/rcu/rcu_segcblist.c | 11
-rw-r--r--  kernel/rcu/rcu_segcblist.h | 11
-rw-r--r--  kernel/rcu/rcuscale.c | 214
-rw-r--r--  kernel/rcu/rcutorture.c | 121
-rw-r--r--  kernel/rcu/refscale.c | 67
-rw-r--r--  kernel/rcu/srcutree.c | 11
-rw-r--r--  kernel/rcu/tasks.h | 214
-rw-r--r--  kernel/rcu/tiny.c | 2
-rw-r--r--  kernel/rcu/tree.c | 176
-rw-r--r--  kernel/rcu/tree.h | 10
-rw-r--r--  kernel/rcu/tree_exp.h | 128
-rw-r--r--  kernel/rcu/tree_nocb.h | 279
-rw-r--r--  kernel/rcu/tree_plugin.h | 11
-rw-r--r--  kernel/rcu/tree_stall.h | 25
-rw-r--r--  kernel/resource.c | 6
-rw-r--r--  kernel/sched/core.c | 74
-rw-r--r--  kernel/sched/cputime.c | 6
-rw-r--r--  kernel/sched/fair.c | 2
-rw-r--r--  kernel/sched/stats.c | 10
-rw-r--r--  kernel/sched/syscalls.c | 8
-rw-r--r--  kernel/signal.c | 34
-rw-r--r--  kernel/smp.c | 38
-rw-r--r--  kernel/softirq.c | 15
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/task_work.c | 6
-rw-r--r--  kernel/time/alarmtimer.c | 9
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 47
-rw-r--r--  kernel/time/hrtimer.c | 24
-rw-r--r--  kernel/time/ntp.c | 19
-rw-r--r--  kernel/time/ntp_internal.h | 4
-rw-r--r--  kernel/time/posix-cpu-timers.c | 207
-rw-r--r--  kernel/time/posix-timers.c | 73
-rw-r--r--  kernel/time/posix-timers.h | 3
-rw-r--r--  kernel/time/tick-broadcast.c | 3
-rw-r--r--  kernel/time/timekeeping.c | 6
-rw-r--r--  kernel/time/timer.c | 64
-rw-r--r--  kernel/trace/fgraph.c | 33
-rw-r--r--  kernel/trace/preemptirq_delay_test.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 12
-rw-r--r--  kernel/trace/trace.c | 8
-rw-r--r--  kernel/trace/trace.h | 23
-rw-r--r--  kernel/trace/trace_events.c | 41
-rw-r--r--  kernel/trace/trace_events_hist.c | 4
-rw-r--r--  kernel/trace/trace_events_inject.c | 2
-rw-r--r--  kernel/trace/trace_events_trigger.c | 6
-rw-r--r--  kernel/trace/trace_osnoise.c | 54
-rw-r--r--  kernel/trace/trace_selftest.c | 23
-rw-r--r--  kernel/trace/tracing_map.c | 6
-rw-r--r--  kernel/user.c | 6
-rw-r--r--  kernel/workqueue.c | 165
108 files changed, 4823 insertions, 2789 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index e7a62ebbf4d1..1edaa4846a47 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1612,7 +1612,7 @@ static void audit_log_multicast(int group, const char *op, int err)
cred = current_cred();
tty = audit_get_tty();
audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u",
- task_pid_nr(current),
+ task_tgid_nr(current),
from_kuid(&init_user_ns, cred->uid),
from_kuid(&init_user_ns, audit_get_loginuid(current)),
tty ? tty_name(tty) : "(none)",
@@ -1706,7 +1706,7 @@ static int __init audit_init(void)
audit_cmd_mutex.owner = NULL;
pr_info("initializing netlink subsys (%s)\n",
- audit_default ? "enabled" : "disabled");
+ str_enabled_disabled(audit_default));
register_pernet_subsys(&audit_net_ops);
audit_initialized = AUDIT_INITIALIZED;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d6ef4f4f9cba..470041c49a44 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1344,7 +1344,7 @@ int audit_filter(int msgtype, unsigned int listtype)
switch (f->type) {
case AUDIT_PID:
- pid = task_pid_nr(current);
+ pid = task_tgid_nr(current);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 6f0d6fb6523f..cd57053b4a69 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2933,7 +2933,7 @@ void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
name, af, nentries, audit_nfcfgs[op].s);
- audit_log_format(ab, " pid=%u", task_pid_nr(current));
+ audit_log_format(ab, " pid=%u", task_tgid_nr(current));
audit_log_task_context(ab); /* subj= */
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, current));
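
The three audit hunks above all switch the logged "pid=" value from the per-thread ID (task_pid_nr()) to the thread-group ID (task_tgid_nr()), so records emitted from a multithreaded process carry the PID userspace expects. A minimal userspace sketch of the distinction, where gettid() corresponds to task_pid_nr() of the current task and getpid() to task_tgid_nr():

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

static void *worker(void *arg)
{
	/* Same process ID as main(), different thread ID. */
	printf("worker: getpid()=%d gettid()=%ld\n",
	       getpid(), (long)syscall(SYS_gettid));
	return NULL;
}

int main(void)
{
	pthread_t t;

	printf("main:   getpid()=%d gettid()=%ld\n",
	       getpid(), (long)syscall(SYS_gettid));
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}

Audit consumers correlate events per process, so reporting the TGID matches what getpid() and /proc/<pid> show.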
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 520f49f422fe..ba91be08763a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -823,9 +823,11 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset)
const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
+ if (!*src)
+ return false;
+
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
- src++;
while (*src && src < src_limit) {
if (!isprint(*src))
return false;
@@ -6283,7 +6285,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
errout:
btf_verifier_env_free(env);
- if (base_btf != vmlinux_btf)
+ if (!IS_ERR(base_btf) && base_btf != vmlinux_btf)
btf_free(base_btf);
if (btf) {
kvfree(btf->data);
@@ -6523,6 +6525,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (prog_args_trusted(prog))
info->reg_type |= PTR_TRUSTED;
+ if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+ info->reg_type |= PTR_MAYBE_NULL;
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
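
The btf_ctx_access() hunk above marks any tracing-program argument whose BTF parameter name carries the "__nullable" suffix as PTR_MAYBE_NULL, so the verifier rejects a dereference that is not preceded by a NULL check. A hedged sketch of a tp_btf program written against a hypothetical attach point (the event name and its task__nullable parameter are made up for illustration):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Assume the kernel-side prototype names its second argument
 * "task__nullable"; that suffix is what makes the verifier demand
 * the NULL check below before any dereference.
 */
SEC("tp_btf/hypothetical_event")
int BPF_PROG(handle_event, int cpu, struct task_struct *task)
{
	if (!task)	/* required: the argument is PTR_MAYBE_NULL */
		return 0;
	bpf_printk("cpu=%d pid=%d", cpu, task->pid);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";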
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index fbdf5a1aabfe..a2f46785ac3b 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -354,12 +354,14 @@ static int cpu_map_kthread_run(void *data)
list_add_tail(&skb->list, &list);
}
- netif_receive_skb_list(&list);
- /* Feedback loop via tracepoint */
+ /* Feedback loop via tracepoint.
+ * NB: keep before recv to allow measuring enqueue/dequeue latency.
+ */
trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
sched, &stats);
+ netif_receive_skb_list(&list);
local_bh_enable(); /* resched point, may call do_softirq() */
}
__set_current_state(TASK_RUNNING);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4cb5441ad75f..39d5710c68ad 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -28,6 +28,8 @@
#include <linux/cpumask.h>
#include <linux/bpf_mem_alloc.h>
#include <net/xdp.h>
+#include <linux/trace_events.h>
+#include <linux/kallsyms.h>
#include "disasm.h"
@@ -16884,8 +16886,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
spi = i / BPF_REG_SIZE;
if (exact != NOT_EXACT &&
- old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
- cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ (i >= cur->allocated_stack ||
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
return false;
if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
@@ -21153,11 +21156,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
{
bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
+ char trace_symbol[KSYM_SYMBOL_LEN];
const char prefix[] = "btf_trace_";
+ struct bpf_raw_event_map *btp;
int ret = 0, subprog = -1, i;
const struct btf_type *t;
bool conservative = true;
- const char *tname;
+ const char *tname, *fname;
struct btf *btf;
long addr = 0;
struct module *mod = NULL;
@@ -21288,10 +21293,34 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return -EINVAL;
}
tname += sizeof(prefix) - 1;
- t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_ptr(t))
- /* should never happen in valid vmlinux build */
+
+ /* The func_proto of "btf_trace_##tname" is generated from typedef without argument
+ * names. Thus using bpf_raw_event_map to get argument names.
+ */
+ btp = bpf_get_raw_tracepoint(tname);
+ if (!btp)
return -EINVAL;
+ fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL,
+ trace_symbol);
+ bpf_put_raw_tracepoint(btp);
+
+ if (fname)
+ ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC);
+
+ if (!fname || ret < 0) {
+ bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n",
+ prefix, tname);
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_ptr(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ } else {
+ t = btf_type_by_id(btf, ret);
+ if (!btf_type_is_func(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ }
+
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
/* should never happen in valid vmlinux build */
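
The bpf_check_attach_target() hunk above exists because the BTF reachable through the "btf_trace_<name>" typedef is a function-pointer prototype with unnamed parameters, while the probe function recorded in the raw event map does carry names. Roughly, for a hypothetical tracepoint "foo" the two views look like this (declarations are illustrative, not copied from a real vmlinux):

struct sk_buff;

/* What was consulted before: a typedef of a function pointer whose
 * BTF func_proto has no parameter names.
 */
typedef void (*btf_trace_foo)(void *, struct sk_buff *, int);

/* What the patch resolves instead, via bpf_get_raw_tracepoint("foo")
 * plus kallsyms_lookup() on btp->bpf_func: the probe function itself,
 * whose FUNC/FUNC_PROTO BTF records usable argument names.
 */
void __bpf_trace_foo(void *__data, struct sk_buff *skb, int drop_reason);

If that lookup fails, the code logs the fallback message and keeps using the nameless typedef as before.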
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 12f8457ad1f9..a5c9359d516f 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,5 +5,6 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
obj-$(CONFIG_CGROUP_MISC) += misc.o
obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index b9dbf6bf2779..e28d5f0d20ed 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -46,6 +46,12 @@ bool cgroup1_ssid_disabled(int ssid)
return cgroup_no_v1_mask & (1 << ssid);
}
+static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
+{
+ /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */
+ return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
+}
+
/**
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
@@ -675,11 +681,14 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
* cgroup_mutex contention.
*/
- for_each_subsys(ss, i)
+ for_each_subsys(ss, i) {
+ if (cgroup1_subsys_absent(ss))
+ continue;
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
+ }
return 0;
}
@@ -932,7 +941,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
if (ret != -ENOPARAM)
return ret;
for_each_subsys(ss, i) {
- if (strcmp(param->key, ss->legacy_name))
+ if (strcmp(param->key, ss->legacy_name) ||
+ cgroup1_subsys_absent(ss))
continue;
if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
return invalfc(fc, "Disabled controller '%s'",
@@ -1024,7 +1034,8 @@ static int check_cgroupfs_options(struct fs_context *fc)
mask = ~((u16)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
+ !cgroup1_subsys_absent(ss))
enabled |= 1 << i;
ctx->subsys_mask &= enabled;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c8e4b62b436a..90e50d6d3cf3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2331,7 +2331,7 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
@@ -3669,12 +3669,40 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct cgroup_subsys_state *css;
+ int dying_cnt[CGROUP_SUBSYS_COUNT];
+ int ssid;
seq_printf(seq, "nr_descendants %d\n",
cgroup->nr_descendants);
+
+ /*
+	 * Show the number of live and dying csses associated with each of
+	 * the non-inhibited cgroup subsystems that are bound to cgroup v2.
+ *
+ * Without proper lock protection, racing is possible. So the
+ * numbers may not be consistent when that happens.
+ */
+ rcu_read_lock();
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ dying_cnt[ssid] = -1;
+ if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+ (cgroup_subsys[ssid]->root != &cgrp_dfl_root))
+ continue;
+ css = rcu_dereference_raw(cgroup->subsys[ssid]);
+ dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+ seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+ css ? (css->nr_descendants + 1) : 0);
+ }
+
seq_printf(seq, "nr_dying_descendants %d\n",
cgroup->nr_dying_descendants);
-
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ if (dying_cnt[ssid] >= 0)
+ seq_printf(seq, "nr_dying_subsys_%s %d\n",
+ cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+ }
+ rcu_read_unlock();
return 0;
}
@@ -4096,7 +4124,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
- * cgroup.procs and cgroup.subtree_control.
+ * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
@@ -5424,6 +5452,8 @@ static void css_release_work_fn(struct work_struct *work)
list_del_rcu(&css->sibling);
if (ss) {
+ struct cgroup *parent_cgrp;
+
/* css release path */
if (!list_empty(&css->rstat_css_node)) {
cgroup_rstat_flush(cgrp);
@@ -5433,6 +5463,21 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
+
+ cgrp->nr_dying_subsys[ss->id]--;
+ /*
+ * When a css is released and ready to be freed, its
+ * nr_descendants must be zero. However, the corresponding
+ * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem
+ * is activated and deactivated multiple times with one or
+ * more of its previous activation leaving behind dying csses.
+ */
+ WARN_ON_ONCE(css->nr_descendants);
+ parent_cgrp = cgroup_parent(cgrp);
+ while (parent_cgrp) {
+ parent_cgrp->nr_dying_subsys[ss->id]--;
+ parent_cgrp = cgroup_parent(parent_cgrp);
+ }
} else {
struct cgroup *tcgrp;
@@ -5517,8 +5562,11 @@ static int online_css(struct cgroup_subsys_state *css)
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
atomic_inc(&css->online_cnt);
- if (css->parent)
+ if (css->parent) {
atomic_inc(&css->parent->online_cnt);
+ while ((css = css->parent))
+ css->nr_descendants++;
+ }
}
return ret;
}
@@ -5540,6 +5588,16 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
}
/**
@@ -6178,7 +6236,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
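
With the cgroup_stat_show() changes above, reading cgroup.stat on the v2 hierarchy additionally reports one "nr_subsys_<name>" line (live csses, including the cgroup's own) and one "nr_dying_subsys_<name>" line per non-inhibited controller bound to the default root. A hypothetical read, with illustrative values only:

nr_descendants 2
nr_subsys_cpuset 0
nr_subsys_cpu 1
nr_subsys_memory 3
nr_dying_descendants 1
nr_dying_subsys_cpuset 0
nr_dying_subsys_cpu 0
nr_dying_subsys_memory 1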
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
new file mode 100644
index 000000000000..976a8bc3ff60
--- /dev/null
+++ b/kernel/cgroup/cpuset-internal.h
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __CPUSET_INTERNAL_H
+#define __CPUSET_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/spinlock.h>
+#include <linux/union_find.h>
+
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+ int cnt; /* unprocessed events count */
+ int val; /* most recent output value */
+ time64_t time; /* clock (secs) when val computed */
+ spinlock_t lock; /* guards read or write of above */
+};
+
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+ PERR_HKEEPING,
+ PERR_ACCESS,
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+ CS_ONLINE,
+ CS_CPU_EXCLUSIVE,
+ CS_MEM_EXCLUSIVE,
+ CS_MEM_HARDWALL,
+ CS_MEMORY_MIGRATE,
+ CS_SCHED_LOAD_BALANCE,
+ CS_SPREAD_PAGE,
+ CS_SPREAD_SLAB,
+} cpuset_flagbits_t;
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+ FILE_MEMORY_MIGRATE,
+ FILE_CPULIST,
+ FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
+ FILE_SUBPARTS_CPULIST,
+ FILE_EXCLUSIVE_CPULIST,
+ FILE_EFFECTIVE_XCPULIST,
+ FILE_ISOLATED_CPULIST,
+ FILE_CPU_EXCLUSIVE,
+ FILE_MEM_EXCLUSIVE,
+ FILE_MEM_HARDWALL,
+ FILE_SCHED_LOAD_BALANCE,
+ FILE_PARTITION_ROOT,
+ FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ FILE_MEMORY_PRESSURE_ENABLED,
+ FILE_MEMORY_PRESSURE,
+ FILE_SPREAD_PAGE,
+ FILE_SPREAD_SLAB,
+} cpuset_filetype_t;
+
+struct cpuset {
+ struct cgroup_subsys_state css;
+
+ unsigned long flags; /* "unsigned long" so bitops work */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierarchy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
+
+ /*
+ * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
+ *
+ * The effective_cpus of a valid partition root comes solely from its
+ * effective_xcpus and some of the effective_xcpus may be distributed
+ * to sub-partitions below & hence excluded from its effective_cpus.
+ * For a valid partition root, its effective_cpus have no relationship
+ * with cpus_allowed unless its exclusive_cpus isn't set.
+ *
+ * This value will only be set if either exclusive_cpus is set or
+ * when this cpuset becomes a local partition root.
+ */
+ cpumask_var_t effective_xcpus;
+
+ /*
+ * Exclusive CPUs as requested by the user (default hierarchy only)
+ *
+ * Its value is independent of cpus_allowed and designates the set of
+ * CPUs that can be granted to the current cpuset or its children when
+ * it becomes a valid partition root. The effective set of exclusive
+ * CPUs granted (effective_xcpus) depends on whether those exclusive
+ * CPUs are passed down by its ancestors and not yet taken up by
+ * another sibling partition root along the way.
+ *
+ * If its value isn't set, it defaults to cpus_allowed.
+ */
+ cpumask_var_t exclusive_cpus;
+
+ /*
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
+
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
+ /* for custom sched domain */
+ int relax_domain_level;
+
+ /* number of valid local child partitions */
+ int nr_subparts;
+
+ /* partition root state */
+ int partition_root_state;
+
+ /*
+ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
+ * know when to rebuild associated root domain bandwidth information.
+ */
+ int nr_deadline_tasks;
+ int nr_migrate_dl_tasks;
+ u64 sum_migrate_dl_bw;
+
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
+ /* Handle for cpuset.cpus.partition */
+ struct cgroup_file partition_file;
+
+	/* Remote partition sibling list anchored at remote_children */
+ struct list_head remote_sibling;
+
+ /* Used to merge intersecting subsets for generate_sched_domains */
+ struct uf_node node;
+};
+
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuset, css) : NULL;
+}
+
+/* Retrieve the cpuset for a task */
+static inline struct cpuset *task_cs(struct task_struct *task)
+{
+ return css_cs(task_css(task, cpuset_cgrp_id));
+}
+
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
+}
+
+/* convenient tests for these bits */
+static inline bool is_cpuset_online(struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
+}
+
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
+static inline int is_sched_load_balance(const struct cpuset *cs)
+{
+ return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+}
+
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+ return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
+ css_for_each_child((pos_css), &(parent_cs)->css) \
+ if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_css by calling
+ * css_rightmost_descendant() to skip subtree. @root_cs is included in the
+ * iteration and the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
+ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
+ if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
+void rebuild_sched_domains_locked(void);
+void cpuset_callback_lock_irq(void);
+void cpuset_callback_unlock_irq(void);
+void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
+void cpuset_update_tasks_nodemask(struct cpuset *cs);
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
+ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int cpuset_common_seq_show(struct seq_file *sf, void *v);
+
+/*
+ * cpuset-v1.c
+ */
+#ifdef CONFIG_CPUSETS_V1
+extern struct cftype cpuset1_files[];
+void fmeter_init(struct fmeter *fmp);
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk);
+void cpuset1_update_tasks_flags(struct cpuset *cs);
+void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated);
+int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+#else
+static inline void fmeter_init(struct fmeter *fmp) {}
+static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk) {}
+static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
+static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated) {}
+static inline int cpuset1_validate_change(struct cpuset *cur,
+ struct cpuset *trial) { return 0; }
+#endif /* CONFIG_CPUSETS_V1 */
+
+#endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
new file mode 100644
index 000000000000..25c1d7b77e2f
--- /dev/null
+++ b/kernel/cgroup/cpuset-v1.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "cpuset-internal.h"
+
+/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
+ */
+struct cpuset_remove_tasks_struct {
+ struct work_struct work;
+ struct cpuset *cs;
+};
+
+/*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter. There are four routines:
+ * fmeter_init() - initialize a frequency meter.
+ * fmeter_markevent() - called each time the event happens.
+ * fmeter_getrate() - returns the recent rate of such events.
+ * fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR). The time unit
+ * is 1 second. Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter. If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds. At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000. At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event. At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933 /* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
+#define FM_SCALE 1000 /* faux fixed point scale */
+
+/* Initialize a frequency meter */
+void fmeter_init(struct fmeter *fmp)
+{
+ fmp->cnt = 0;
+ fmp->val = 0;
+ fmp->time = 0;
+ spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+ time64_t now;
+ u32 ticks;
+
+ now = ktime_get_seconds();
+ ticks = now - fmp->time;
+
+ if (ticks == 0)
+ return;
+
+ ticks = min(FM_MAXTICKS, ticks);
+ while (ticks-- > 0)
+ fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+ fmp->time = now;
+
+ fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+ fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+ spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+ int val;
+
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ val = fmp->val;
+ spin_unlock(&fmp->lock);
+ return val;
+}
+
+/*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled __read_mostly;
+
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernels page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure". Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ */
+
+void __cpuset_memory_pressure_bump(void)
+{
+ rcu_read_lock();
+ fmeter_markevent(&task_cs(current)->fmeter);
+ rcu_read_unlock();
+}
+
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
+{
+#ifdef CONFIG_SMP
+ if (val < -1 || val > sched_domain_level_max + 1)
+ return -EINVAL;
+#endif
+
+ if (val != cs->relax_domain_level) {
+ cs->relax_domain_level = val;
+ if (!cpumask_empty(cs->cpus_allowed) &&
+ is_sched_load_balance(cs))
+ rebuild_sched_domains_locked();
+ }
+
+ return 0;
+}
+
+static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
+
+ cpus_read_lock();
+ cpuset_lock();
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ retval = update_relax_domain_level(cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ cpuset_unlock();
+ cpus_read_unlock();
+ return retval;
+}
+
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ return cs->relax_domain_level;
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+/*
+ * update task's spread flag if cpuset's page/slab spread flag is set
+ *
+ * Call with callback_lock or cpuset_mutex held. The check can be skipped
+ * if on default hierarchy.
+ */
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk)
+{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
+ if (is_spread_page(cs))
+ task_set_spread_page(tsk);
+ else
+ task_clear_spread_page(tsk);
+
+ if (is_spread_slab(cs))
+ task_set_spread_slab(tsk);
+ else
+ task_clear_spread_slab(tsk);
+}
+
+/**
+ * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed
+ *
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
+ */
+void cpuset1_update_tasks_flags(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, 0, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset1_update_task_spread_flags(cs, task);
+ css_task_iter_end(&it);
+}
+
+/*
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+ struct cpuset *parent;
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = parent_cs(cs);
+ while (cpumask_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent_cs(parent);
+
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
+}
+
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+ struct cpuset_remove_tasks_struct *s;
+
+ s = container_of(work, struct cpuset_remove_tasks_struct, work);
+ remove_tasks_in_empty_cpuset(s->cs);
+ css_put(&s->cs->css);
+ kfree(s);
+}
+
+void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ cpuset_callback_lock_irq();
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ cpuset_callback_unlock_irq();
+
+ /*
+ * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migrated to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ cpuset_update_tasks_cpumask(cs, new_cpus);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ cpuset_update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ /*
+	 * Move tasks to the nearest ancestor with execution resources.
+	 * This is a full cgroup operation which will also call back into
+	 * cpuset. Execute it asynchronously using a workqueue.
+ */
+ if (is_empty && cs->css.cgroup->nr_populated_csets &&
+ css_tryget_online(&cs->css)) {
+ struct cpuset_remove_tasks_struct *s;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (WARN_ON_ONCE(!s)) {
+ css_put(&cs->css);
+ return;
+ }
+
+ s->cs = cs;
+ INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+ schedule_work(&s->work);
+ }
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set. Call holding cpuset_mutex.
+ */
+
+static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
+{
+ return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ nodes_subset(p->mems_allowed, q->mems_allowed) &&
+ is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+ is_mem_exclusive(p) <= is_mem_exclusive(q);
+}
+
+/*
+ * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ return is_cpu_exclusive(cs);
+ case FILE_MEM_EXCLUSIVE:
+ return is_mem_exclusive(cs);
+ case FILE_MEM_HARDWALL:
+ return is_mem_hardwall(cs);
+ case FILE_SCHED_LOAD_BALANCE:
+ return is_sched_load_balance(cs);
+ case FILE_MEMORY_MIGRATE:
+ return is_memory_migrate(cs);
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ return cpuset_memory_pressure_enabled;
+ case FILE_MEMORY_PRESSURE:
+ return fmeter_getrate(&cs->fmeter);
+ case FILE_SPREAD_PAGE:
+ return is_spread_page(cs);
+ case FILE_SPREAD_SLAB:
+ return is_spread_slab(cs);
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = 0;
+
+ cpus_read_lock();
+ cpuset_lock();
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_EXCLUSIVE:
+ retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_HARDWALL:
+ retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
+ break;
+ case FILE_SCHED_LOAD_BALANCE:
+ retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
+ break;
+ case FILE_MEMORY_MIGRATE:
+ retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
+ break;
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ cpuset_memory_pressure_enabled = !!val;
+ break;
+ case FILE_SPREAD_PAGE:
+ retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
+ break;
+ case FILE_SPREAD_SLAB:
+ retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ cpuset_unlock();
+ cpus_read_unlock();
+ return retval;
+}
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+struct cftype cpuset1_files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ },
+
+ {
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpu_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_CPU_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_hardwall",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_HARDWALL,
+ },
+
+ {
+ .name = "sched_load_balance",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SCHED_LOAD_BALANCE,
+ },
+
+ {
+ .name = "sched_relax_domain_level",
+ .read_s64 = cpuset_read_s64,
+ .write_s64 = cpuset_write_s64,
+ .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ },
+
+ {
+ .name = "memory_migrate",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_MIGRATE,
+ },
+
+ {
+ .name = "memory_pressure",
+ .read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
+ },
+
+ {
+ .name = "memory_spread_page",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_PAGE,
+ },
+
+ {
+ /* obsolete, may be removed in the future */
+ .name = "memory_spread_slab",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_SLAB,
+ },
+
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
+
+ { } /* terminate */
+};
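
The frequency meter added above is a single-pole IIR low-pass filter: each elapsed second scales the stored value by FM_COEF/FM_SCALE = 0.933, and each marked event adds (FM_SCALE - FM_COEF) = 67 units (values are scaled by 1000). Because 0.933^10 ≈ 0.5, the documented 10-second half-life follows directly. A standalone sketch of the decay arithmetic, with the constants copied from the file above:

#include <stdio.h>

#define FM_COEF   933	/* decay factor per 1-second tick, scaled by 1000 */
#define FM_SCALE 1000

int main(void)
{
	int val = 100000;	/* arbitrary starting rate (100.000 events/sec) */

	for (int ticks = 1; ticks <= 20; ticks++) {
		val = (FM_COEF * val) / FM_SCALE;	/* same step as fmeter_update() */
		if (ticks == 10 || ticks == 20)
			printf("after %2d idle seconds: val=%d\n", ticks, val);
	}
	/* Prints roughly 50000 and then roughly 25000: with no new events
	 * the reported rate halves about every 10 seconds.
	 */
	return 0;
}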
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 40ec4abaf440..a4dd285cdf39 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -22,11 +22,8 @@
* distribution for more details.
*/
#include "cgroup-internal.h"
+#include "cpuset-internal.h"
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/cpuset.h>
-#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
@@ -40,10 +37,8 @@
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/security.h>
-#include <linux/spinlock.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
-#include <linux/cgroup.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
@@ -57,30 +52,6 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
*/
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
-/* See "Frequency meter" comments, below. */
-
-struct fmeter {
- int cnt; /* unprocessed events count */
- int val; /* most recent output value */
- time64_t time; /* clock (secs) when val computed */
- spinlock_t lock; /* guards read or write of above */
-};
-
-/*
- * Invalid partition error code
- */
-enum prs_errcode {
- PERR_NONE = 0,
- PERR_INVCPUS,
- PERR_INVPARENT,
- PERR_NOTPART,
- PERR_NOTEXCL,
- PERR_NOCPUS,
- PERR_HOTPLUG,
- PERR_CPUSEMPTY,
- PERR_HKEEPING,
-};
-
static const char * const perr_strings[] = {
[PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",
[PERR_INVPARENT] = "Parent is an invalid partition root",
@@ -90,133 +61,7 @@ static const char * const perr_strings[] = {
[PERR_HOTPLUG] = "No cpu available due to hotplug",
[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
-};
-
-struct cpuset {
- struct cgroup_subsys_state css;
-
- unsigned long flags; /* "unsigned long" so bitops work */
-
- /*
- * On default hierarchy:
- *
- * The user-configured masks can only be changed by writing to
- * cpuset.cpus and cpuset.mems, and won't be limited by the
- * parent masks.
- *
- * The effective masks is the real masks that apply to the tasks
- * in the cpuset. They may be changed if the configured masks are
- * changed or hotplug happens.
- *
- * effective_mask == configured_mask & parent's effective_mask,
- * and if it ends up empty, it will inherit the parent's mask.
- *
- *
- * On legacy hierarchy:
- *
- * The user-configured masks are always the same with effective masks.
- */
-
- /* user-configured CPUs and Memory Nodes allow to tasks */
- cpumask_var_t cpus_allowed;
- nodemask_t mems_allowed;
-
- /* effective CPUs and Memory Nodes allow to tasks */
- cpumask_var_t effective_cpus;
- nodemask_t effective_mems;
-
- /*
- * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
- *
- * The effective_cpus of a valid partition root comes solely from its
- * effective_xcpus and some of the effective_xcpus may be distributed
- * to sub-partitions below & hence excluded from its effective_cpus.
- * For a valid partition root, its effective_cpus have no relationship
- * with cpus_allowed unless its exclusive_cpus isn't set.
- *
- * This value will only be set if either exclusive_cpus is set or
- * when this cpuset becomes a local partition root.
- */
- cpumask_var_t effective_xcpus;
-
- /*
- * Exclusive CPUs as requested by the user (default hierarchy only)
- *
- * Its value is independent of cpus_allowed and designates the set of
- * CPUs that can be granted to the current cpuset or its children when
- * it becomes a valid partition root. The effective set of exclusive
- * CPUs granted (effective_xcpus) depends on whether those exclusive
- * CPUs are passed down by its ancestors and not yet taken up by
- * another sibling partition root along the way.
- *
- * If its value isn't set, it defaults to cpus_allowed.
- */
- cpumask_var_t exclusive_cpus;
-
- /*
- * This is old Memory Nodes tasks took on.
- *
- * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
- * - A new cpuset's old_mems_allowed is initialized when some
- * task is moved into it.
- * - old_mems_allowed is used in cpuset_migrate_mm() when we change
- * cpuset.mems_allowed and have tasks' nodemask updated, and
- * then old_mems_allowed is updated to mems_allowed.
- */
- nodemask_t old_mems_allowed;
-
- struct fmeter fmeter; /* memory_pressure filter */
-
- /*
- * Tasks are being attached to this cpuset. Used to prevent
- * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
- */
- int attach_in_progress;
-
- /* partition number for rebuild_sched_domains() */
- int pn;
-
- /* for custom sched domain */
- int relax_domain_level;
-
- /* number of valid local child partitions */
- int nr_subparts;
-
- /* partition root state */
- int partition_root_state;
-
- /*
- * Default hierarchy only:
- * use_parent_ecpus - set if using parent's effective_cpus
- * child_ecpus_count - # of children with use_parent_ecpus set
- */
- int use_parent_ecpus;
- int child_ecpus_count;
-
- /*
- * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
- * know when to rebuild associated root domain bandwidth information.
- */
- int nr_deadline_tasks;
- int nr_migrate_dl_tasks;
- u64 sum_migrate_dl_bw;
-
- /* Invalid partition error code, not lock protected */
- enum prs_errcode prs_err;
-
- /* Handle for cpuset.cpus.partition */
- struct cgroup_file partition_file;
-
- /* Remote partition silbling list anchored at remote_children */
- struct list_head remote_sibling;
-};
-
-/*
- * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
- */
-struct cpuset_remove_tasks_struct {
- struct work_struct work;
- struct cpuset *cs;
+ [PERR_ACCESS] = "Enable partition not permitted",
};
/*
@@ -229,10 +74,23 @@ static cpumask_var_t subpartitions_cpus;
*/
static cpumask_var_t isolated_cpus;
+/*
+ * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
+ */
+static cpumask_var_t boot_hk_cpus;
+static bool have_boot_isolcpus;
+
/* List of remote partition root children */
static struct list_head remote_children;
/*
+ * A flag to force sched domain rebuild at the end of an operation while
+ * inhibiting it in the intermediate stages when set. Currently it is only
+ * set in hotplug code.
+ */
+static bool force_sd_rebuild;
+
+/*
* Partition root states:
*
* 0 - member (not a partition root)
@@ -272,22 +130,6 @@ struct tmpmasks {
cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
};
-static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
-{
- return css ? container_of(css, struct cpuset, css) : NULL;
-}
-
-/* Retrieve the cpuset for a task */
-static inline struct cpuset *task_cs(struct task_struct *task)
-{
- return css_cs(task_css(task, cpuset_cgrp_id));
-}
-
-static inline struct cpuset *parent_cs(struct cpuset *cs)
-{
- return css_cs(cs->css.parent);
-}
-
void inc_dl_tasks_cs(struct task_struct *p)
{
struct cpuset *cs = task_cs(p);
@@ -302,59 +144,6 @@ void dec_dl_tasks_cs(struct task_struct *p)
cs->nr_deadline_tasks--;
}
-/* bits in struct cpuset flags field */
-typedef enum {
- CS_ONLINE,
- CS_CPU_EXCLUSIVE,
- CS_MEM_EXCLUSIVE,
- CS_MEM_HARDWALL,
- CS_MEMORY_MIGRATE,
- CS_SCHED_LOAD_BALANCE,
- CS_SPREAD_PAGE,
- CS_SPREAD_SLAB,
-} cpuset_flagbits_t;
-
-/* convenient tests for these bits */
-static inline bool is_cpuset_online(struct cpuset *cs)
-{
- return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
-}
-
-static inline int is_cpu_exclusive(const struct cpuset *cs)
-{
- return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
-}
-
-static inline int is_mem_exclusive(const struct cpuset *cs)
-{
- return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
-}
-
-static inline int is_mem_hardwall(const struct cpuset *cs)
-{
- return test_bit(CS_MEM_HARDWALL, &cs->flags);
-}
-
-static inline int is_sched_load_balance(const struct cpuset *cs)
-{
- return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-}
-
-static inline int is_memory_migrate(const struct cpuset *cs)
-{
- return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
-}
-
-static inline int is_spread_page(const struct cpuset *cs)
-{
- return test_bit(CS_SPREAD_PAGE, &cs->flags);
-}
-
-static inline int is_spread_slab(const struct cpuset *cs)
-{
- return test_bit(CS_SPREAD_SLAB, &cs->flags);
-}
-
static inline int is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
@@ -396,34 +185,6 @@ static struct cpuset top_cpuset = {
.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};
-/**
- * cpuset_for_each_child - traverse online children of a cpuset
- * @child_cs: loop cursor pointing to the current child
- * @pos_css: used for iteration
- * @parent_cs: target cpuset to walk children of
- *
- * Walk @child_cs through the online children of @parent_cs. Must be used
- * with RCU read locked.
- */
-#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
- css_for_each_child((pos_css), &(parent_cs)->css) \
- if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
-
-/**
- * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
- * @des_cs: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @root_cs: target cpuset to walk ancestor of
- *
- * Walk @des_cs through the online descendants of @root_cs. Must be used
- * with RCU read locked. The caller may modify @pos_css by calling
- * css_rightmost_descendant() to skip subtree. @root_cs is included in the
- * iteration and the first node to be visited.
- */
-#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
- css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
- if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
-
/*
* There are two global locks guarding cpuset structures - cpuset_mutex and
* callback_lock. We also require taking task_lock() when dereferencing a
@@ -477,6 +238,16 @@ void cpuset_unlock(void)
static DEFINE_SPINLOCK(callback_lock);
+void cpuset_callback_lock_irq(void)
+{
+ spin_lock_irq(&callback_lock);
+}
+
+void cpuset_callback_unlock_irq(void)
+{
+ spin_unlock_irq(&callback_lock);
+}
+
static struct workqueue_struct *cpuset_migrate_mm_wq;
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
@@ -493,6 +264,26 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
}
/*
+ * decrease cs->attach_in_progress.
+ * wake_up cpuset_attach_wq if cs->attach_in_progress==0.
+ */
+static inline void dec_attach_in_progress_locked(struct cpuset *cs)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+}
+
+static inline void dec_attach_in_progress(struct cpuset *cs)
+{
+ mutex_lock(&cpuset_mutex);
+ dec_attach_in_progress_locked(cs);
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
* the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
@@ -589,45 +380,6 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
-/*
- * update task's spread flag if cpuset's page/slab spread flag is set
- *
- * Call with callback_lock or cpuset_mutex held. The check can be skipped
- * if on default hierarchy.
- */
-static void cpuset_update_task_spread_flags(struct cpuset *cs,
- struct task_struct *tsk)
-{
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
- return;
-
- if (is_spread_page(cs))
- task_set_spread_page(tsk);
- else
- task_clear_spread_page(tsk);
-
- if (is_spread_slab(cs))
- task_set_spread_slab(tsk);
- else
- task_clear_spread_slab(tsk);
-}
-
-/*
- * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
- *
- * One cpuset is a subset of another if all its allowed CPUs and
- * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
- */
-
-static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
-{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
- nodes_subset(p->mems_allowed, q->mems_allowed) &&
- is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
- is_mem_exclusive(p) <= is_mem_exclusive(q);
-}
-
/**
* alloc_cpumasks - allocate three cpumasks for cpuset
* @cs: the cpuset that have cpumasks to be allocated.
@@ -743,13 +495,6 @@ static inline bool xcpus_empty(struct cpuset *cs)
cpumask_empty(cs->exclusive_cpus);
}
-static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
-{
- return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
- cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
- : cs->effective_xcpus;
-}
-
/*
* cpusets_are_exclusive() - check if two cpusets are exclusive
*
@@ -757,8 +502,8 @@ static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
*/
static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
{
- struct cpumask *xcpus1 = fetch_xcpus(cs1);
- struct cpumask *xcpus2 = fetch_xcpus(cs2);
+ struct cpumask *xcpus1 = user_xcpus(cs1);
+ struct cpumask *xcpus2 = user_xcpus(cs2);
if (cpumask_intersects(xcpus1, xcpus2))
return false;
@@ -766,35 +511,6 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
}
/*
- * validate_change_legacy() - Validate conditions specific to legacy (v1)
- * behavior.
- */
-static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
-{
- struct cgroup_subsys_state *css;
- struct cpuset *c, *par;
- int ret;
-
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
-
- /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- par = parent_cs(cur);
- if (par && !is_cpuset_subset(trial, par))
- goto out;
-
- ret = 0;
-out:
- return ret;
-}
-
-/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
*
@@ -823,7 +539,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
rcu_read_lock();
if (!is_in_v2_mode())
- ret = validate_change_legacy(cur, trial);
+ ret = cpuset1_validate_change(cur, trial);
if (ret)
goto out;
@@ -989,18 +705,15 @@ static inline int nr_cpusets(void)
* were changed (added or removed.)
*
* Finding the best partition (set of domains):
- * The triple nested loops below over i, j, k scan over the
- * load balanced cpusets (using the array of cpuset pointers in
- * csa[]) looking for pairs of cpusets that have overlapping
- * cpus_allowed, but which don't have the same 'pn' partition
- * number and gives them in the same partition number. It keeps
- * looping on the 'restart' label until it can no longer find
- * any such pairs.
+ * The double nested loops below over i, j scan over the load
+ * balanced cpusets (using the array of cpuset pointers in csa[])
+ * looking for pairs of cpusets that have overlapping cpus_allowed
+ * and merging them using a union-find algorithm.
+ *
+ * The union of the cpus_allowed masks from the set of all cpusets
+ * having the same root then forms one element of the partition
+ * (one sched domain) to be passed to partition_sched_domains().
*
- * The union of the cpus_allowed masks from the set of
- * all cpusets having the same 'pn' value then form the one
- * element of the partition (one sched domain) to be passed to
- * partition_sched_domains().
*/
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
@@ -1008,7 +721,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
- int i, j, k; /* indices for partition finding loops */
+ int i, j; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
@@ -1016,6 +729,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ int nslot_update;
doms = NULL;
dattr = NULL;
@@ -1104,32 +818,28 @@ v2:
goto single_root_domain;
for (i = 0; i < csn; i++)
- csa[i]->pn = i;
- ndoms = csn;
+ uf_node_init(&csa[i]->node);
-restart:
- /* Find the best partition (set of sched domains) */
+ /* Merge overlapping cpusets */
for (i = 0; i < csn; i++) {
- struct cpuset *a = csa[i];
- int apn = a->pn;
-
- for (j = 0; j < csn; j++) {
- struct cpuset *b = csa[j];
- int bpn = b->pn;
-
- if (apn != bpn && cpusets_overlap(a, b)) {
- for (k = 0; k < csn; k++) {
- struct cpuset *c = csa[k];
-
- if (c->pn == bpn)
- c->pn = apn;
- }
- ndoms--; /* one less element */
- goto restart;
+ for (j = i + 1; j < csn; j++) {
+ if (cpusets_overlap(csa[i], csa[j])) {
+ /*
+ * Cgroup v2 shouldn't pass down overlapping
+ * partition root cpusets.
+ */
+ WARN_ON_ONCE(cgrpv2);
+ uf_union(&csa[i]->node, &csa[j]->node);
}
}
}
+ /* Count the total number of domains */
+ for (i = 0; i < csn; i++) {
+ if (uf_find(&csa[i]->node) == &csa[i]->node)
+ ndoms++;
+ }
+
/*
* Now we know how many domains to create.
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
@@ -1160,44 +870,25 @@ restart:
}
for (nslot = 0, i = 0; i < csn; i++) {
- struct cpuset *a = csa[i];
- struct cpumask *dp;
- int apn = a->pn;
-
- if (apn < 0) {
- /* Skip completed partitions */
- continue;
- }
-
- dp = doms[nslot];
-
- if (nslot == ndoms) {
- static int warnings = 10;
- if (warnings) {
- pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
- nslot, ndoms, csn, i, apn);
- warnings--;
- }
- continue;
- }
-
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
+ nslot_update = 0;
for (j = i; j < csn; j++) {
- struct cpuset *b = csa[j];
-
- if (apn == b->pn) {
- cpumask_or(dp, dp, b->effective_cpus);
+ if (uf_find(&csa[j]->node) == &csa[i]->node) {
+ struct cpumask *dp = doms[nslot];
+
+ if (i == j) {
+ nslot_update = 1;
+ cpumask_clear(dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ }
+ cpumask_or(dp, dp, csa[j]->effective_cpus);
cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
- update_domain_attr_tree(dattr + nslot, b);
-
- /* Done with this partition */
- b->pn = -1;
+ update_domain_attr_tree(dattr + nslot, csa[j]);
}
}
- nslot++;
+ if (nslot_update)
+ nslot++;
}
BUG_ON(nslot != ndoms);
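
For illustration only, here is a self-contained user-space analogue of the union-find merge performed in generate_sched_domains() above. Plain unsigned longs stand in for cpumasks, and a minimal parent-pointer structure stands in for the kernel's uf_node_init()/uf_union()/uf_find() helpers; none of the names below exist in the kernel.

	/* Toy stand-ins: cpus[i] plays the role of csa[i]->effective_cpus. */
	#include <stdio.h>

	struct node { int parent; };

	static int find(struct node *n, int i)
	{
		while (n[i].parent != i) {
			n[i].parent = n[n[i].parent].parent;	/* path halving */
			i = n[i].parent;
		}
		return i;
	}

	static void join(struct node *n, int a, int b)
	{
		n[find(n, a)].parent = find(n, b);
	}

	int main(void)
	{
		unsigned long cpus[] = { 0x03, 0x06, 0x30 };	/* {0,1} {1,2} {4,5} */
		struct node n[3];
		int i, j, ndoms = 0;

		for (i = 0; i < 3; i++)
			n[i].parent = i;			/* like uf_node_init() */

		for (i = 0; i < 3; i++)				/* merge overlapping cpusets */
			for (j = i + 1; j < 3; j++)
				if (cpus[i] & cpus[j])
					join(n, i, j);		/* like uf_union() */

		for (i = 0; i < 3; i++)				/* one surviving root == one sched domain */
			if (find(n, i) == i)			/* like uf_find() */
				ndoms++;

		printf("%d sched domains\n", ndoms);		/* prints 2: {0,1,2} and {4,5} */
		return 0;
	}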
@@ -1289,7 +980,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
*
* Call with cpuset_mutex held. Takes cpus_read_lock().
*/
-static void rebuild_sched_domains_locked(void)
+void rebuild_sched_domains_locked(void)
{
struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
@@ -1341,7 +1032,7 @@ static void rebuild_sched_domains_locked(void)
partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */
@@ -1361,7 +1052,7 @@ void rebuild_sched_domains(void)
}
/**
- * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
* @new_cpus: the temp variable for the new effective_cpus mask
*
@@ -1371,7 +1062,7 @@ void rebuild_sched_domains(void)
* is used instead of effective_cpus to make sure all offline CPUs are also
* included as hotplug code won't update cpumasks for tasks in top_cpuset.
*/
-static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
+void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
@@ -1421,8 +1112,6 @@ enum partition_cmd {
partcmd_invalidate, /* Make partition invalid */
};
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
- int turning_on);
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct tmpmasks *tmp);
@@ -1436,11 +1125,11 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs)
bool exclusive = (new_prs > PRS_MEMBER);
if (exclusive && !is_cpu_exclusive(cs)) {
- if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
+ if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))
return PERR_NOTEXCL;
} else if (!exclusive && is_cpu_exclusive(cs)) {
/* Turning off CS_CPU_EXCLUSIVE will not return error */
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);
}
return 0;
}
@@ -1475,7 +1164,7 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
- if (rebuild_domains)
+ if (rebuild_domains && !force_sd_rebuild)
rebuild_sched_domains_locked();
}
@@ -1509,12 +1198,8 @@ static void reset_partition_data(struct cpuset *cs)
if (is_cpu_exclusive(cs))
clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}
- if (!cpumask_and(cs->effective_cpus,
- parent->effective_cpus, cs->cpus_allowed)) {
- cs->use_parent_ecpus = true;
- parent->child_ecpus_count++;
+ if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
- }
}
/*
@@ -1655,7 +1340,7 @@ static inline bool is_local_partition(struct cpuset *cs)
* @cs: the cpuset to update
* @new_prs: new partition_root_state
  * @tmp: temporary masks
- * Return: 1 if successful, 0 if error
+ * Return: 0 if successful, an error code otherwise
*
* Enable the current cpuset to become a remote partition root taking CPUs
* directly from the top cpuset. cpuset_mutex must be held by the caller.
@@ -1669,7 +1354,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* The user must have sysadmin privilege.
*/
if (!capable(CAP_SYS_ADMIN))
- return 0;
+ return PERR_ACCESS;
/*
* The requested exclusive_cpus must not be allocated to other
@@ -1683,26 +1368,20 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
if (cpumask_empty(tmp->new_cpus) ||
cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
- return 0;
+ return PERR_INVCPUS;
spin_lock_irq(&callback_lock);
isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
list_add(&cs->remote_sibling, &remote_children);
- if (cs->use_parent_ecpus) {
- struct cpuset *parent = parent_cs(cs);
-
- cs->use_parent_ecpus = false;
- parent->child_ecpus_count--;
- }
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
 	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
- update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
- return 1;
+ return 0;
}
/*
@@ -1736,7 +1415,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
/*
 	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
- update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
}
@@ -1788,7 +1467,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
/*
 	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
- update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
return;
@@ -1833,7 +1512,7 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
remote_partition_disable(child, tmp);
disable_cnt++;
}
- if (disable_cnt)
+ if (disable_cnt && !force_sd_rebuild)
rebuild_sched_domains_locked();
}
@@ -1843,15 +1522,15 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
* @new_cpus: cpu mask
* Return: true if there is conflict, false otherwise
*
- * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
- * an isolated partition.
+ * CPUs outside of boot_hk_cpus, if defined, can only be used in an
+ * isolated partition.
*/
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
- const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
- bool all_in_hk = cpumask_subset(new_cpus, hk_domain);
+ if (!have_boot_isolcpus)
+ return false;
- if (!all_in_hk && (prstate != PRS_ISOLATED))
+ if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
return true;
return false;
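
A toy user-space illustration of the boot_hk_cpus check above, assuming a kernel booted with isolcpus=2-3 so that boot_hk_cpus covers CPUs 0-1. The PRS_* values and bitmask cpumasks are stand-ins, and the have_boot_isolcpus early return is omitted.

	#include <stdio.h>
	#include <stdbool.h>

	#define PRS_ROOT	1	/* stand-in values, not taken from kernel headers */
	#define PRS_ISOLATED	2

	/* Mirrors prstate_housekeeping_conflict(): only an isolated partition may
	 * claim CPUs outside the boot-time housekeeping mask. */
	static bool conflict(int prstate, unsigned long new_cpus, unsigned long boot_hk)
	{
		return prstate != PRS_ISOLATED && (new_cpus & ~boot_hk);
	}

	int main(void)
	{
		unsigned long boot_hk = 0x3;	/* CPUs 0-1 (isolcpus=2-3 at boot) */

		printf("root partition wants CPU2: conflict=%d\n",
		       conflict(PRS_ROOT, 0x4, boot_hk));	/* 1: not allowed */
		printf("isolated partition wants CPU2: conflict=%d\n",
		       conflict(PRS_ISOLATED, 0x4, boot_hk));	/* 0: allowed */
		return 0;
	}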
@@ -1991,6 +1670,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
part_error = PERR_CPUSEMPTY;
goto write_error;
}
+	/* Check newmask again to see whether CPUs are available for parent/cs */
+ nocpu |= tasks_nocpu_error(parent, cs, newmask);
/*
* partcmd_update with newmask:
@@ -2158,7 +1839,7 @@ write_error:
update_partition_exclusive(cs, new_prs);
if (adding || deleting) {
- update_tasks_cpumask(parent, tmp->addmask);
+ cpuset_update_tasks_cpumask(parent, tmp->addmask);
update_sibling_cpumasks(parent, cs, tmp);
}
@@ -2316,17 +1997,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* it is a partition root that has explicitly distributed
* out all its CPUs.
*/
- if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
+ if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
- if (!cp->use_parent_ecpus) {
- cp->use_parent_ecpus = true;
- parent->child_ecpus_count++;
- }
- } else if (cp->use_parent_ecpus) {
- cp->use_parent_ecpus = false;
- WARN_ON_ONCE(!parent->child_ecpus_count);
- parent->child_ecpus_count--;
- }
if (remote)
goto get_css;
@@ -2350,7 +2022,7 @@ update_parent_effective:
/*
* update_parent_effective_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
- * update_tasks_cpumask() again for tasks in the parent
+ * cpuset_update_tasks_cpumask() again for tasks in the parent
* cpuset if the parent's effective_cpus changes.
*/
if ((cp != cs) && old_prs) {
@@ -2407,7 +2079,7 @@ get_css:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp, cp->effective_cpus);
+ cpuset_update_tasks_cpumask(cp, cp->effective_cpus);
/*
* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
@@ -2440,7 +2112,8 @@ get_css:
}
rcu_read_unlock();
- if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
+ if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) &&
+ !force_sd_rebuild)
rebuild_sched_domains_locked();
}
@@ -2462,8 +2135,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* Check all its siblings and call update_cpumasks_hier()
* if their effective_cpus will need to be changed.
*
- * With the addition of effective_xcpus which is a subset of
- * cpus_allowed. It is possible a change in parent's effective_cpus
+ * It is possible a change in parent's effective_cpus
* due to a change in a child partition's effective_xcpus will impact
* its siblings even if they do not inherit parent's effective_cpus
* directly.
@@ -2477,8 +2149,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
cpuset_for_each_child(sibling, pos_css, parent) {
if (sibling == cs)
continue;
- if (!sibling->use_parent_ecpus &&
- !is_partition_valid(sibling)) {
+ if (!is_partition_valid(sibling)) {
compute_effective_cpumask(tmp->new_cpus, sibling,
parent);
if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
@@ -2523,7 +2194,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
*/
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
- cpumask_clear(trialcs->effective_xcpus);
+ if (cpumask_empty(trialcs->exclusive_cpus))
+ cpumask_clear(trialcs->effective_xcpus);
} else {
retval = cpulist_parse(buf, trialcs->cpus_allowed);
if (retval < 0)
@@ -2587,7 +2259,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
invalidate = true;
rcu_read_lock();
cpuset_for_each_child(cp, css, parent) {
- struct cpumask *xcpus = fetch_xcpus(trialcs);
+ struct cpumask *xcpus = user_xcpus(trialcs);
if (is_partition_valid(cp) &&
cpumask_intersects(xcpus, cp->effective_xcpus)) {
@@ -2834,14 +2506,14 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
static void *cpuset_being_rebound;
/**
- * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
* effective cpuset's. As this function is called with cpuset_mutex held,
* cpuset membership stays stable.
*/
-static void update_tasks_nodemask(struct cpuset *cs)
+void cpuset_update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
struct css_task_iter it;
@@ -2939,7 +2611,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
- update_tasks_nodemask(cp);
+ cpuset_update_tasks_nodemask(cp);
rcu_read_lock();
css_put(&cp->css);
@@ -3025,44 +2697,8 @@ bool current_cpuset_is_being_rebound(void)
return ret;
}
-static int update_relax_domain_level(struct cpuset *cs, s64 val)
-{
-#ifdef CONFIG_SMP
- if (val < -1 || val > sched_domain_level_max + 1)
- return -EINVAL;
-#endif
-
- if (val != cs->relax_domain_level) {
- cs->relax_domain_level = val;
- if (!cpumask_empty(cs->cpus_allowed) &&
- is_sched_load_balance(cs))
- rebuild_sched_domains_locked();
- }
-
- return 0;
-}
-
-/**
- * update_tasks_flags - update the spread flags of tasks in the cpuset.
- * @cs: the cpuset in which each task's spread flags needs to be changed
- *
- * Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
- * stable.
- */
-static void update_tasks_flags(struct cpuset *cs)
-{
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flags(cs, task);
- css_task_iter_end(&it);
-}
-
/*
- * update_flag - read a 0 or a 1 in a file and update associated flag
+ * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (see cpuset_flagbits_t)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
@@ -3070,7 +2706,7 @@ static void update_tasks_flags(struct cpuset *cs)
* Call with cpuset_mutex held.
*/
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
struct cpuset *trialcs;
@@ -3101,11 +2737,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
cs->flags = trialcs->flags;
spin_unlock_irq(&callback_lock);
- if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
+ if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed &&
+ !force_sd_rebuild)
rebuild_sched_domains_locked();
if (spread_flag_changed)
- update_tasks_flags(cs);
+ cpuset1_update_tasks_flags(cs);
out:
free_cpuset(trialcs);
return err;
@@ -3154,9 +2791,6 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
if (!old_prs) {
- enum partition_cmd cmd = (new_prs == PRS_ROOT)
- ? partcmd_enable : partcmd_enablei;
-
/*
* cpus_allowed and exclusive_cpus cannot be both empty.
*/
@@ -3165,13 +2799,18 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
}
- err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
/*
- * If an attempt to become local partition root fails,
- * try to become a remote partition root instead.
+	 * If the parent is a valid partition, enable a local partition.
+ * Otherwise, enable a remote partition.
*/
- if (err && remote_partition_enable(cs, new_prs, &tmpmask))
- err = 0;
+ if (is_partition_valid(parent)) {
+ enum partition_cmd cmd = (new_prs == PRS_ROOT)
+ ? partcmd_enable : partcmd_enablei;
+
+ err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
+ } else {
+ err = remote_partition_enable(cs, new_prs, &tmpmask);
+ }
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
@@ -3224,107 +2863,6 @@ out:
return 0;
}
-/*
- * Frequency meter - How fast is some event occurring?
- *
- * These routines manage a digitally filtered, constant time based,
- * event frequency meter. There are four routines:
- * fmeter_init() - initialize a frequency meter.
- * fmeter_markevent() - called each time the event happens.
- * fmeter_getrate() - returns the recent rate of such events.
- * fmeter_update() - internal routine used to update fmeter.
- *
- * A common data structure is passed to each of these routines,
- * which is used to keep track of the state required to manage the
- * frequency meter and its digital filter.
- *
- * The filter works on the number of events marked per unit time.
- * The filter is single-pole low-pass recursive (IIR). The time unit
- * is 1 second. Arithmetic is done using 32-bit integers scaled to
- * simulate 3 decimal digits of precision (multiplied by 1000).
- *
- * With an FM_COEF of 933, and a time base of 1 second, the filter
- * has a half-life of 10 seconds, meaning that if the events quit
- * happening, then the rate returned from the fmeter_getrate()
- * will be cut in half each 10 seconds, until it converges to zero.
- *
- * It is not worth doing a real infinitely recursive filter. If more
- * than FM_MAXTICKS ticks have elapsed since the last filter event,
- * just compute FM_MAXTICKS ticks worth, by which point the level
- * will be stable.
- *
- * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
- * arithmetic overflow in the fmeter_update() routine.
- *
- * Given the simple 32 bit integer arithmetic used, this meter works
- * best for reporting rates between one per millisecond (msec) and
- * one per 32 (approx) seconds. At constant rates faster than one
- * per msec it maxes out at values just under 1,000,000. At constant
- * rates between one per msec, and one per second it will stabilize
- * to a value N*1000, where N is the rate of events per second.
- * At constant rates between one per second and one per 32 seconds,
- * it will be choppy, moving up on the seconds that have an event,
- * and then decaying until the next event. At rates slower than
- * about one in 32 seconds, it decays all the way back to zero between
- * each event.
- */
-
-#define FM_COEF 933 /* coefficient for half-life of 10 secs */
-#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
-#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
-#define FM_SCALE 1000 /* faux fixed point scale */
-
-/* Initialize a frequency meter */
-static void fmeter_init(struct fmeter *fmp)
-{
- fmp->cnt = 0;
- fmp->val = 0;
- fmp->time = 0;
- spin_lock_init(&fmp->lock);
-}
-
-/* Internal meter update - process cnt events and update value */
-static void fmeter_update(struct fmeter *fmp)
-{
- time64_t now;
- u32 ticks;
-
- now = ktime_get_seconds();
- ticks = now - fmp->time;
-
- if (ticks == 0)
- return;
-
- ticks = min(FM_MAXTICKS, ticks);
- while (ticks-- > 0)
- fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
- fmp->time = now;
-
- fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
- fmp->cnt = 0;
-}
-
-/* Process any previous ticks, then bump cnt by one (times scale). */
-static void fmeter_markevent(struct fmeter *fmp)
-{
- spin_lock(&fmp->lock);
- fmeter_update(fmp);
- fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
- spin_unlock(&fmp->lock);
-}
-
-/* Process any previous ticks, then return current value. */
-static int fmeter_getrate(struct fmeter *fmp)
-{
- int val;
-
- spin_lock(&fmp->lock);
- fmeter_update(fmp);
- val = fmp->val;
- spin_unlock(&fmp->lock);
- return val;
-}
-
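
A quick user-space check of the 10-second half-life quoted in the fmeter comment above (constants copied from the block being moved): each idle one-second tick scales the value by FM_COEF/FM_SCALE = 0.933, and ten such ticks roughly halve it.

	#include <stdio.h>

	int main(void)
	{
		int val = 1000;		/* a steady rate of 1 event/sec, scaled by FM_SCALE */

		for (int t = 0; t < 10; t++)
			val = (933 * val) / 1000;	/* one idle tick of fmeter_update() */

		printf("val after 10 idle seconds: %d\n", val);	/* ~495, i.e. about half */
		return 0;
	}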
static struct cpuset *cpuset_attach_old_cs;
/*
@@ -3433,9 +2971,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
cs = css_cs(css);
mutex_lock(&cpuset_mutex);
- cs->attach_in_progress--;
- if (!cs->attach_in_progress)
- wake_up(&cpuset_attach_wq);
+ dec_attach_in_progress_locked(cs);
if (cs->nr_migrate_dl_tasks) {
int cpu = cpumask_any(cs->effective_cpus);
@@ -3471,7 +3007,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flags(cs, task);
+ cpuset1_update_task_spread_flags(cs, task);
}
static void cpuset_attach(struct cgroup_taskset *tset)
@@ -3550,116 +3086,15 @@ out:
reset_migrate_dl_data(cs);
}
- cs->attach_in_progress--;
- if (!cs->attach_in_progress)
- wake_up(&cpuset_attach_wq);
+ dec_attach_in_progress_locked(cs);
mutex_unlock(&cpuset_mutex);
}
-/* The various types of files and directories in a cpuset file system */
-
-typedef enum {
- FILE_MEMORY_MIGRATE,
- FILE_CPULIST,
- FILE_MEMLIST,
- FILE_EFFECTIVE_CPULIST,
- FILE_EFFECTIVE_MEMLIST,
- FILE_SUBPARTS_CPULIST,
- FILE_EXCLUSIVE_CPULIST,
- FILE_EFFECTIVE_XCPULIST,
- FILE_ISOLATED_CPULIST,
- FILE_CPU_EXCLUSIVE,
- FILE_MEM_EXCLUSIVE,
- FILE_MEM_HARDWALL,
- FILE_SCHED_LOAD_BALANCE,
- FILE_PARTITION_ROOT,
- FILE_SCHED_RELAX_DOMAIN_LEVEL,
- FILE_MEMORY_PRESSURE_ENABLED,
- FILE_MEMORY_PRESSURE,
- FILE_SPREAD_PAGE,
- FILE_SPREAD_SLAB,
-} cpuset_filetype_t;
-
-static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 val)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- int retval = 0;
-
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs)) {
- retval = -ENODEV;
- goto out_unlock;
- }
-
- switch (type) {
- case FILE_CPU_EXCLUSIVE:
- retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
- break;
- case FILE_MEM_EXCLUSIVE:
- retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
- break;
- case FILE_MEM_HARDWALL:
- retval = update_flag(CS_MEM_HARDWALL, cs, val);
- break;
- case FILE_SCHED_LOAD_BALANCE:
- retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
- break;
- case FILE_MEMORY_MIGRATE:
- retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
- break;
- case FILE_MEMORY_PRESSURE_ENABLED:
- cpuset_memory_pressure_enabled = !!val;
- break;
- case FILE_SPREAD_PAGE:
- retval = update_flag(CS_SPREAD_PAGE, cs, val);
- break;
- case FILE_SPREAD_SLAB:
- retval = update_flag(CS_SPREAD_SLAB, cs, val);
- break;
- default:
- retval = -EINVAL;
- break;
- }
-out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
- return retval;
-}
-
-static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
- s64 val)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- int retval = -ENODEV;
-
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs))
- goto out_unlock;
-
- switch (type) {
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- retval = update_relax_domain_level(cs, val);
- break;
- default:
- retval = -EINVAL;
- break;
- }
-out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
- return retval;
-}
-
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
-static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
@@ -3734,7 +3169,7 @@ out_unlock:
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
*/
-static int cpuset_common_seq_show(struct seq_file *sf, void *v)
+int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
struct cpuset *cs = css_cs(seq_css(sf));
cpuset_filetype_t type = seq_cft(sf)->private;
@@ -3775,52 +3210,6 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
return ret;
}
-static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- switch (type) {
- case FILE_CPU_EXCLUSIVE:
- return is_cpu_exclusive(cs);
- case FILE_MEM_EXCLUSIVE:
- return is_mem_exclusive(cs);
- case FILE_MEM_HARDWALL:
- return is_mem_hardwall(cs);
- case FILE_SCHED_LOAD_BALANCE:
- return is_sched_load_balance(cs);
- case FILE_MEMORY_MIGRATE:
- return is_memory_migrate(cs);
- case FILE_MEMORY_PRESSURE_ENABLED:
- return cpuset_memory_pressure_enabled;
- case FILE_MEMORY_PRESSURE:
- return fmeter_getrate(&cs->fmeter);
- case FILE_SPREAD_PAGE:
- return is_spread_page(cs);
- case FILE_SPREAD_SLAB:
- return is_spread_slab(cs);
- default:
- BUG();
- }
-
- /* Unreachable but makes gcc happy */
- return 0;
-}
-
-static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- switch (type) {
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- return cs->relax_domain_level;
- default:
- BUG();
- }
-
- /* Unreachable but makes gcc happy */
- return 0;
-}
-
static int sched_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
@@ -3885,113 +3274,6 @@ out_unlock:
}
/*
- * for the common functions, 'private' gives the type of file
- */
-
-static struct cftype legacy_files[] = {
- {
- .name = "cpus",
- .seq_show = cpuset_common_seq_show,
- .write = cpuset_write_resmask,
- .max_write_len = (100U + 6 * NR_CPUS),
- .private = FILE_CPULIST,
- },
-
- {
- .name = "mems",
- .seq_show = cpuset_common_seq_show,
- .write = cpuset_write_resmask,
- .max_write_len = (100U + 6 * MAX_NUMNODES),
- .private = FILE_MEMLIST,
- },
-
- {
- .name = "effective_cpus",
- .seq_show = cpuset_common_seq_show,
- .private = FILE_EFFECTIVE_CPULIST,
- },
-
- {
- .name = "effective_mems",
- .seq_show = cpuset_common_seq_show,
- .private = FILE_EFFECTIVE_MEMLIST,
- },
-
- {
- .name = "cpu_exclusive",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_CPU_EXCLUSIVE,
- },
-
- {
- .name = "mem_exclusive",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEM_EXCLUSIVE,
- },
-
- {
- .name = "mem_hardwall",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEM_HARDWALL,
- },
-
- {
- .name = "sched_load_balance",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SCHED_LOAD_BALANCE,
- },
-
- {
- .name = "sched_relax_domain_level",
- .read_s64 = cpuset_read_s64,
- .write_s64 = cpuset_write_s64,
- .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
- },
-
- {
- .name = "memory_migrate",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_MIGRATE,
- },
-
- {
- .name = "memory_pressure",
- .read_u64 = cpuset_read_u64,
- .private = FILE_MEMORY_PRESSURE,
- },
-
- {
- .name = "memory_spread_page",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SPREAD_PAGE,
- },
-
- {
- /* obsolete, may be removed in the future */
- .name = "memory_spread_slab",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SPREAD_SLAB,
- },
-
- {
- .name = "memory_pressure_enabled",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
- },
-
- { } /* terminate */
-};
-
-/*
* This is currently a minimal set for the default hierarchy. It can be
* expanded later on by migrating more features and control files from v1.
*/
@@ -4138,8 +3420,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
- cs->use_parent_ecpus = true;
- parent->child_ecpus_count++;
}
spin_unlock_irq(&callback_lock);
@@ -4203,14 +3483,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-
- if (cs->use_parent_ecpus) {
- struct cpuset *parent = parent_cs(cs);
-
- cs->use_parent_ecpus = false;
- parent->child_ecpus_count--;
- }
+ cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
@@ -4300,11 +3573,7 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
if (same_cs)
return;
- mutex_lock(&cpuset_mutex);
- cs->attach_in_progress--;
- if (!cs->attach_in_progress)
- wake_up(&cpuset_attach_wq);
- mutex_unlock(&cpuset_mutex);
+ dec_attach_in_progress(cs);
}
/*
@@ -4336,10 +3605,7 @@ static void cpuset_fork(struct task_struct *task)
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cpuset_attach_task(cs, task);
- cs->attach_in_progress--;
- if (!cs->attach_in_progress)
- wake_up(&cpuset_attach_wq);
-
+ dec_attach_in_progress_locked(cs);
mutex_unlock(&cpuset_mutex);
}
@@ -4356,7 +3622,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_fork = cpuset_can_fork,
.cancel_fork = cpuset_cancel_fork,
.fork = cpuset_fork,
- .legacy_cftypes = legacy_files,
+#ifdef CONFIG_CPUSETS_V1
+ .legacy_cftypes = cpuset1_files,
+#endif
.dfl_cftypes = dfl_files,
.early_init = true,
.threaded = true,
@@ -4389,91 +3657,14 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
- return 0;
-}
-
-/*
- * If CPU and/or memory hotplug handlers, below, unplug any CPUs
- * or memory nodes, we need to walk over the cpuset hierarchy,
- * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then move the tasks in the empty
- * cpuset to its next-highest non-empty parent.
- */
-static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
-{
- struct cpuset *parent;
-
- /*
- * Find its next-highest non-empty parent, (top cpuset
- * has online cpus, so can't be empty).
- */
- parent = parent_cs(cs);
- while (cpumask_empty(parent->cpus_allowed) ||
- nodes_empty(parent->mems_allowed))
- parent = parent_cs(parent);
-
- if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
- pr_cont_cgroup_name(cs->css.cgroup);
- pr_cont("\n");
+ have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
+ if (have_boot_isolcpus) {
+ BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
+ cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
}
-}
-
-static void cpuset_migrate_tasks_workfn(struct work_struct *work)
-{
- struct cpuset_remove_tasks_struct *s;
-
- s = container_of(work, struct cpuset_remove_tasks_struct, work);
- remove_tasks_in_empty_cpuset(s->cs);
- css_put(&s->cs->css);
- kfree(s);
-}
-
-static void
-hotplug_update_tasks_legacy(struct cpuset *cs,
- struct cpumask *new_cpus, nodemask_t *new_mems,
- bool cpus_updated, bool mems_updated)
-{
- bool is_empty;
-
- spin_lock_irq(&callback_lock);
- cpumask_copy(cs->cpus_allowed, new_cpus);
- cpumask_copy(cs->effective_cpus, new_cpus);
- cs->mems_allowed = *new_mems;
- cs->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
-
- /*
- * Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migrated to an ancestor.
- */
- if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs, new_cpus);
- if (mems_updated && !nodes_empty(cs->mems_allowed))
- update_tasks_nodemask(cs);
-
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
-
- /*
- * Move tasks to the nearest ancestor with execution resources,
- * This is full cgroup operation which will also call back into
- * cpuset. Execute it asynchronously using workqueue.
- */
- if (is_empty && cs->css.cgroup->nr_populated_csets &&
- css_tryget_online(&cs->css)) {
- struct cpuset_remove_tasks_struct *s;
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (WARN_ON_ONCE(!s)) {
- css_put(&cs->css);
- return;
- }
- s->cs = cs;
- INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
- schedule_work(&s->work);
- }
+ return 0;
}
static void
@@ -4493,16 +3684,14 @@ hotplug_update_tasks(struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (cpus_updated)
- update_tasks_cpumask(cs, new_cpus);
+ cpuset_update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
- update_tasks_nodemask(cs);
+ cpuset_update_tasks_nodemask(cs);
}
-static bool force_rebuild;
-
void cpuset_force_rebuild(void)
{
- force_rebuild = true;
+ force_sd_rebuild = true;
}
/**
@@ -4598,7 +3787,7 @@ update_tasks:
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
- hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
unlock:
@@ -4650,15 +3839,9 @@ static void cpuset_handle_hotplug(void)
!cpumask_empty(subpartitions_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
- /*
- * In the rare case that hotplug removes all the cpus in
- * subpartitions_cpus, we assumed that cpus are updated.
- */
- if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
- cpus_updated = true;
-
/* For v1, synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
+ cpuset_force_rebuild();
spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
@@ -4689,7 +3872,7 @@ static void cpuset_handle_hotplug(void)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
spin_unlock_irq(&callback_lock);
- update_tasks_nodemask(&top_cpuset);
+ cpuset_update_tasks_nodemask(&top_cpuset);
}
mutex_unlock(&cpuset_mutex);
@@ -4714,8 +3897,8 @@ static void cpuset_handle_hotplug(void)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated || force_rebuild) {
- force_rebuild = false;
+ if (force_sd_rebuild) {
+ force_sd_rebuild = false;
rebuild_sched_domains_cpuslocked();
}
@@ -5029,19 +4212,6 @@ int cpuset_mem_spread_node(void)
}
/**
- * cpuset_slab_spread_node() - On which node to begin search for a slab page
- */
-int cpuset_slab_spread_node(void)
-{
- if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
- current->cpuset_slab_spread_rotor =
- node_random(&current->mems_allowed);
-
- return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
-}
-EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
-
-/**
* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
* @tsk1: pointer to task_struct of some task.
* @tsk2: pointer to task_struct of some other task.
@@ -5079,39 +4249,6 @@ void cpuset_print_current_mems_allowed(void)
rcu_read_unlock();
}
-/*
- * Collection of memory_pressure is suppressed unless
- * this flag is enabled by writing "1" to the special
- * cpuset file 'memory_pressure_enabled' in the root cpuset.
- */
-
-int cpuset_memory_pressure_enabled __read_mostly;
-
-/*
- * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
- *
- * Keep a running average of the rate of synchronous (direct)
- * page reclaim efforts initiated by tasks in each cpuset.
- *
- * This represents the rate at which some task in the cpuset
- * ran low on memory on all nodes it was allowed to use, and
- * had to enter the kernels page reclaim code in an effort to
- * create more free memory by tossing clean pages or swapping
- * or writing dirty pages.
- *
- * Display to user space in the per-cpuset read-only file
- * "memory_pressure". Value displayed is an integer
- * representing the recent rate of entry into the synchronous
- * (direct) page reclaim by any task attached to the cpuset.
- */
-
-void __cpuset_memory_pressure_bump(void)
-{
- rcu_read_lock();
- fmeter_markevent(&task_cs(current)->fmeter);
- rcu_read_unlock();
-}
-
#ifdef CONFIG_PROC_PID_CPUSET
/*
* proc_cpuset_show()
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index f5cb0ec45b9d..8f61114c36dd 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -244,7 +244,6 @@ static void pids_event(struct pids_cgroup *pids_forking,
struct pids_cgroup *pids_over_limit)
{
struct pids_cgroup *p = pids_forking;
- bool limit = false;
/* Only log the first time limit is hit. */
if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
@@ -252,20 +251,17 @@ static void pids_event(struct pids_cgroup *pids_forking,
pr_cont_cgroup_path(p->css.cgroup);
pr_cont("\n");
}
- cgroup_file_notify(&p->events_local_file);
if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
- cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+ cgroup_file_notify(&p->events_local_file);
return;
+ }
- for (; parent_pids(p); p = parent_pids(p)) {
- if (p == pids_over_limit) {
- limit = true;
- atomic64_inc(&p->events_local[PIDCG_MAX]);
- cgroup_file_notify(&p->events_local_file);
- }
- if (limit)
- atomic64_inc(&p->events[PIDCG_MAX]);
+ atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]);
+ cgroup_file_notify(&pids_over_limit->events_local_file);
+ for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) {
+ atomic64_inc(&p->events[PIDCG_MAX]);
cgroup_file_notify(&p->events_file);
}
}
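
For clarity, a hypothetical user-space model of the reworked accounting above: the forking cgroup records the local fork failure, the cgroup whose limit was actually hit records the local max event, and the hierarchical max counter is bumped on that cgroup and each of its ancestors below the root, mirroring the parent_pids() loop condition. The cgroup v1 / CGRP_ROOT_PIDS_LOCAL_EVENTS early return is left out, and the struct and field names are illustrative only.

	#include <stdio.h>

	struct pcg {
		struct pcg *parent;
		long forkfail_local, max_local, max;
	};

	static void pids_event_model(struct pcg *forking, struct pcg *over_limit)
	{
		forking->forkfail_local++;		/* events.local: seen at the fork site */
		over_limit->max_local++;		/* events.local: where the limit lives */
		for (struct pcg *p = over_limit; p->parent; p = p->parent)
			p->max++;			/* hierarchical events: max */
	}

	int main(void)
	{
		struct pcg root = { 0 };
		struct pcg mid  = { .parent = &root };
		struct pcg leaf = { .parent = &mid };

		pids_event_model(&leaf, &mid);		/* leaf forked; mid's pids.max was hit */

		printf("leaf fail=%ld, mid local=%ld max=%ld, root max=%ld\n",
		       leaf.forkfail_local, mid.max_local, mid.max, root.max);
		/* prints: leaf fail=1, mid local=1 max=1, root max=0 */
		return 0;
	}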
@@ -276,15 +272,10 @@ static void pids_event(struct pids_cgroup *pids_forking,
*/
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
- struct cgroup_subsys_state *css;
struct pids_cgroup *pids, *pids_over_limit;
int err;
- if (cset)
- css = cset->subsys[pids_cgrp_id];
- else
- css = task_css_check(current, pids_cgrp_id, true);
- pids = css_pids(css);
+ pids = css_pids(cset->subsys[pids_cgrp_id]);
err = pids_try_charge(pids, 1, &pids_over_limit);
if (err)
pids_event(pids, pids_over_limit);
@@ -294,14 +285,9 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset)
static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
- struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
- if (cset)
- css = cset->subsys[pids_cgrp_id];
- else
- css = task_css_check(current, pids_cgrp_id, true);
- pids = css_pids(css);
+ pids = css_pids(cset->subsys[pids_cgrp_id]);
pids_uncharge(pids, 1);
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 24b1e1143260..938c48952d26 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -28,34 +28,34 @@
DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
- .dynticks_nesting = 1,
- .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
+ .nesting = 1,
+ .nmi_nesting = CT_NESTING_IRQ_NONIDLE,
#endif
- .state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
+ .state = ATOMIC_INIT(CT_RCU_WATCHING),
};
EXPORT_SYMBOL_GPL(context_tracking);
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
#define TPS(x) tracepoint_string(x)
-/* Record the current task on dyntick-idle entry. */
-static __always_inline void rcu_dynticks_task_enter(void)
+/* Record the current task on exiting RCU-tasks (dyntick-idle entry). */
+static __always_inline void rcu_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
-/* Record no current task on dyntick-idle exit. */
-static __always_inline void rcu_dynticks_task_exit(void)
+/* Record no current task on entering RCU-tasks (dyntick-idle exit). */
+static __always_inline void rcu_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
-/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
-static __always_inline void rcu_dynticks_task_trace_enter(void)
+/* Turn on heavyweight RCU tasks trace readers on kernel exit. */
+static __always_inline void rcu_task_trace_heavyweight_enter(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
@@ -63,8 +63,8 @@ static __always_inline void rcu_dynticks_task_trace_enter(void)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
-static __always_inline void rcu_dynticks_task_trace_exit(void)
+/* Turn off heavyweight RCU tasks trace readers on kernel entry. */
+static __always_inline void rcu_task_trace_heavyweight_exit(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
@@ -87,10 +87,10 @@ static noinstr void ct_kernel_exit_state(int offset)
* critical sections, and we also must force ordering with the
* next idle sojourn.
*/
- rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
+ rcu_task_trace_heavyweight_enter(); // Before CT state update!
seq = ct_state_inc(offset);
// RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING));
}
/*
@@ -109,15 +109,15 @@ static noinstr void ct_kernel_enter_state(int offset)
*/
seq = ct_state_inc(offset);
// RCU is now watching. Better not be in an extended quiescent state!
- rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
+ rcu_task_trace_heavyweight_exit(); // After CT state update!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING));
}
/*
* Enter an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
*
- * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
+ * We crowbar the ->nmi_nesting field to zero to allow for
* the possibility of usermode upcalls having messed up our count
* of interrupt nesting level during the prior busy period.
*/
@@ -125,19 +125,19 @@ static void noinstr ct_kernel_exit(bool user, int offset)
{
struct context_tracking *ct = this_cpu_ptr(&context_tracking);
- WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
- WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
+ WARN_ON_ONCE(ct_nmi_nesting() != CT_NESTING_IRQ_NONIDLE);
+ WRITE_ONCE(ct->nmi_nesting, 0);
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- ct_dynticks_nesting() == 0);
- if (ct_dynticks_nesting() != 1) {
+ ct_nesting() == 0);
+ if (ct_nesting() != 1) {
// RCU will still be watching, so just do accounting and leave.
- ct->dynticks_nesting--;
+ ct->nesting--;
return;
}
instrumentation_begin();
lockdep_assert_irqs_disabled();
- trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
+ trace_rcu_watching(TPS("End"), ct_nesting(), 0, ct_rcu_watching());
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rcu_preempt_deferred_qs(current);
@@ -145,18 +145,18 @@ static void noinstr ct_kernel_exit(bool user, int offset)
instrument_atomic_write(&ct->state, sizeof(ct->state));
instrumentation_end();
- WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+ WRITE_ONCE(ct->nesting, 0); /* Avoid irq-access tearing. */
// RCU is watching here ...
ct_kernel_exit_state(offset);
// ... but is no longer watching here.
- rcu_dynticks_task_enter();
+ rcu_task_exit();
}
/*
* Exit an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
*
- * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
+ * We crowbar the ->nmi_nesting field to CT_NESTING_IRQ_NONIDLE to
* allow for the possibility of usermode upcalls messing up our count of
* interrupt nesting level during the busy period that is just now starting.
*/
@@ -166,14 +166,14 @@ static void noinstr ct_kernel_enter(bool user, int offset)
long oldval;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
- oldval = ct_dynticks_nesting();
+ oldval = ct_nesting();
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
if (oldval) {
// RCU was already watching, so just do accounting and leave.
- ct->dynticks_nesting++;
+ ct->nesting++;
return;
}
- rcu_dynticks_task_exit();
+ rcu_task_enter();
// RCU is not watching here ...
ct_kernel_enter_state(offset);
// ... but is watching here.
@@ -182,11 +182,11 @@ static void noinstr ct_kernel_enter(bool user, int offset)
// instrumentation for the noinstr ct_kernel_enter_state()
instrument_atomic_write(&ct->state, sizeof(ct->state));
- trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
+ trace_rcu_watching(TPS("Start"), ct_nesting(), 1, ct_rcu_watching());
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- WRITE_ONCE(ct->dynticks_nesting, 1);
- WARN_ON_ONCE(ct_dynticks_nmi_nesting());
- WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
+ WRITE_ONCE(ct->nesting, 1);
+ WARN_ON_ONCE(ct_nmi_nesting());
+ WRITE_ONCE(ct->nmi_nesting, CT_NESTING_IRQ_NONIDLE);
instrumentation_end();
}
@@ -194,7 +194,7 @@ static void noinstr ct_kernel_enter(bool user, int offset)
* ct_nmi_exit - inform RCU of exit from NMI context
*
* If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
+ * RCU-idle period, update ct->state and ct->nmi_nesting
* to let the RCU grace-period handling know that the CPU is back to
* being RCU-idle.
*
@@ -207,47 +207,47 @@ void noinstr ct_nmi_exit(void)
instrumentation_begin();
/*
- * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * Check for ->nmi_nesting underflow and bad CT state.
* (We are exiting an NMI handler, so RCU better be paying attention
* to us!)
*/
- WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
- WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+ WARN_ON_ONCE(ct_nmi_nesting() <= 0);
+ WARN_ON_ONCE(!rcu_is_watching_curr_cpu());
/*
* If the nesting level is not 1, the CPU wasn't RCU-idle, so
* leave it in non-RCU-idle state.
*/
- if (ct_dynticks_nmi_nesting() != 1) {
- trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
- ct_dynticks());
- WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
- ct_dynticks_nmi_nesting() - 2);
+ if (ct_nmi_nesting() != 1) {
+ trace_rcu_watching(TPS("--="), ct_nmi_nesting(), ct_nmi_nesting() - 2,
+ ct_rcu_watching());
+ WRITE_ONCE(ct->nmi_nesting, /* No store tearing. */
+ ct_nmi_nesting() - 2);
instrumentation_end();
return;
}
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
- WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+ trace_rcu_watching(TPS("Endirq"), ct_nmi_nesting(), 0, ct_rcu_watching());
+ WRITE_ONCE(ct->nmi_nesting, 0); /* Avoid store tearing. */
// instrumentation for the noinstr ct_kernel_exit_state()
instrument_atomic_write(&ct->state, sizeof(ct->state));
instrumentation_end();
// RCU is watching here ...
- ct_kernel_exit_state(RCU_DYNTICKS_IDX);
+ ct_kernel_exit_state(CT_RCU_WATCHING);
// ... but is no longer watching here.
if (!in_nmi())
- rcu_dynticks_task_enter();
+ rcu_task_exit();
}
/**
* ct_nmi_enter - inform RCU of entry to NMI context
*
* If the CPU was idle from RCU's viewpoint, update ct->state and
- * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * ct->nmi_nesting to let the RCU grace-period handling know
* that the CPU is active. This implementation permits nested NMIs, as
* long as the nesting level does not overflow an int. (You will probably
* run out of stack space first.)
@@ -261,27 +261,27 @@ void noinstr ct_nmi_enter(void)
struct context_tracking *ct = this_cpu_ptr(&context_tracking);
/* Complain about underflow. */
- WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);
+ WARN_ON_ONCE(ct_nmi_nesting() < 0);
/*
- * If idle from RCU viewpoint, atomically increment ->dynticks
- * to mark non-idle and increment ->dynticks_nmi_nesting by one.
- * Otherwise, increment ->dynticks_nmi_nesting by two. This means
- * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * If idle from RCU viewpoint, atomically increment CT state
+ * to mark non-idle and increment ->nmi_nesting by one.
+ * Otherwise, increment ->nmi_nesting by two. This means
+ * if ->nmi_nesting is equal to one, we are guaranteed
* to be in the outermost NMI handler that interrupted an RCU-idle
* period (observation due to Andy Lutomirski).
*/
- if (rcu_dynticks_curr_cpu_in_eqs()) {
+ if (!rcu_is_watching_curr_cpu()) {
if (!in_nmi())
- rcu_dynticks_task_exit();
+ rcu_task_enter();
// RCU is not watching here ...
- ct_kernel_enter_state(RCU_DYNTICKS_IDX);
+ ct_kernel_enter_state(CT_RCU_WATCHING);
// ... but is watching here.
instrumentation_begin();
- // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
+ // instrumentation for the noinstr rcu_is_watching_curr_cpu()
instrument_atomic_read(&ct->state, sizeof(ct->state));
// instrumentation for the noinstr ct_kernel_enter_state()
instrument_atomic_write(&ct->state, sizeof(ct->state));
@@ -294,12 +294,12 @@ void noinstr ct_nmi_enter(void)
instrumentation_begin();
}
- trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
- ct_dynticks_nmi_nesting(),
- ct_dynticks_nmi_nesting() + incby, ct_dynticks());
+ trace_rcu_watching(incby == 1 ? TPS("Startirq") : TPS("++="),
+ ct_nmi_nesting(),
+ ct_nmi_nesting() + incby, ct_rcu_watching());
instrumentation_end();
- WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
- ct_dynticks_nmi_nesting() + incby);
+ WRITE_ONCE(ct->nmi_nesting, /* Prevent store tearing. */
+ ct_nmi_nesting() + incby);
barrier();
}
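
A worked trace of the ->nmi_nesting arithmetic described above, using plain user-space integers (a hypothetical nesting scenario; whether nested NMIs occur is architecture-dependent): the outermost NMI taken from an RCU-idle CPU adds 1, every further NMI adds 2, and each exit subtracts 2 unless the count is exactly 1, in which case the CPU returns to RCU-idle and the count resets to 0.

	#include <stdio.h>

	int main(void)
	{
		int nesting = 0;	/* CPU is RCU-idle, ->nmi_nesting == 0 */

		nesting += 1;		/* ct_nmi_enter(), outermost NMI:   0 -> 1 */
		nesting += 2;		/* ct_nmi_enter(), nested NMI:      1 -> 3 */
		nesting -= 2;		/* ct_nmi_exit(), nested handler:   3 -> 1 */
		if (nesting == 1)	/* ct_nmi_exit(), outermost handler */
			nesting = 0;	/* back to RCU-idle */

		printf("final nesting: %d\n", nesting);	/* 0 */
		return 0;
	}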
@@ -317,7 +317,7 @@ void noinstr ct_nmi_enter(void)
void noinstr ct_idle_enter(void)
{
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
- ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
+ ct_kernel_exit(false, CT_RCU_WATCHING + CT_STATE_IDLE);
}
EXPORT_SYMBOL_GPL(ct_idle_enter);
@@ -335,7 +335,7 @@ void noinstr ct_idle_exit(void)
unsigned long flags;
raw_local_irq_save(flags);
- ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
+ ct_kernel_enter(false, CT_RCU_WATCHING - CT_STATE_IDLE);
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ct_idle_exit);
@@ -485,7 +485,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* user_exit() or ct_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
- if (state == CONTEXT_USER) {
+ if (state == CT_STATE_USER) {
instrumentation_begin();
trace_user_enter(0);
vtime_user_enter(current);
@@ -504,7 +504,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* CPU doesn't need to maintain the tick for RCU maintenance purposes
* when the CPU runs in userspace.
*/
- ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);
+ ct_kernel_exit(true, CT_RCU_WATCHING + state);
/*
* Special case if we only track user <-> kernel transitions for tickless
@@ -534,7 +534,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
* with NMIs. OTOH we don't care about ordering here since
- * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * RCU only requires CT_RCU_WATCHING increments to be fully
* ordered.
*/
raw_atomic_add(state, &ct->state);
@@ -620,8 +620,8 @@ void noinstr __ct_user_exit(enum ctx_state state)
* Exit RCU idle mode while entering the kernel because it can
* run a RCU read side critical section anytime.
*/
- ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
- if (state == CONTEXT_USER) {
+ ct_kernel_enter(true, CT_RCU_WATCHING - state);
+ if (state == CT_STATE_USER) {
instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
@@ -634,17 +634,17 @@ void noinstr __ct_user_exit(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- raw_atomic_set(&ct->state, CONTEXT_KERNEL);
+ raw_atomic_set(&ct->state, CT_STATE_KERNEL);
} else {
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- raw_atomic_set(&ct->state, CONTEXT_KERNEL);
+ raw_atomic_set(&ct->state, CT_STATE_KERNEL);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
* with NMIs. OTOH we don't care about ordering here since
- * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * RCU only requires CT_RCU_WATCHING increments to be fully
* ordered.
*/
raw_atomic_sub(state, &ct->state);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1209ddaec026..d293d52a3e00 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -330,7 +330,7 @@ static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state st
/* Poll for one millisecond */
arch_cpuhp_sync_state_poll();
} else {
- usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE);
+ usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC);
}
sync = atomic_read(st);
}
@@ -1808,6 +1808,7 @@ static int __init parallel_bringup_parse_param(char *arg)
}
early_param("cpuhp.parallel", parallel_bringup_parse_param);
+#ifdef CONFIG_HOTPLUG_SMT
static inline bool cpuhp_smt_aware(void)
{
return cpu_smt_max_threads > 1;
@@ -1817,6 +1818,21 @@ static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
{
return cpu_primary_thread_mask;
}
+#else
+static inline bool cpuhp_smt_aware(void)
+{
+ return false;
+}
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_none_mask;
+}
+#endif
+
+bool __weak arch_cpuhp_init_parallel_bringup(void)
+{
+ return true;
+}
/*
* On architectures which have enabled parallel bringup this invokes all BP
@@ -2689,6 +2705,14 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
return ret;
}
+/* Check if the core a CPU belongs to is online */
+#if !defined(topology_is_core_online)
+static inline bool topology_is_core_online(unsigned int cpu)
+{
+ return true;
+}
+#endif
+
int cpuhp_smt_enable(void)
{
int cpu, ret = 0;
@@ -2699,7 +2723,7 @@ int cpuhp_smt_enable(void)
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
continue;
- if (!cpu_smt_thread_allowed(cpu))
+ if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu))
continue;
ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
if (ret)
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index d3b4cd12bdd1..64d44a52c011 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -423,7 +423,8 @@ retry:
if (high && search_end == CRASH_ADDR_HIGH_MAX) {
search_end = CRASH_ADDR_LOW_MAX;
search_base = 0;
- goto retry;
+ if (search_end != CRASH_ADDR_HIGH_MAX)
+ goto retry;
}
pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
crash_size);
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index a6e3792b15f8..d570535342cb 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -416,8 +416,11 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry)
* dma_active_cacheline entry to track per event. dma_map_sg(), on the
* other hand, consumes a single dma_debug_entry, but inserts 'nents'
* entries into the tree.
+ *
+ * Use __GFP_NOWARN because an allocation-failure printk routed to netconsole
+ * could end up right back in the DMA debugging code, leading to a deadlock.
*/
-static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC);
+static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC | __GFP_NOWARN);
static DEFINE_SPINLOCK(radix_lock);
#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 90843cc38588..5b6934e23c21 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -182,7 +182,7 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
unsigned long nr = syscall_get_nr(current, regs);
- CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+ CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aa3450bdc227..b21c8f24a987 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1255,8 +1255,9 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex
* perf_event::child_mutex;
* perf_event_context::lock
- * perf_event::mmap_mutex
* mmap_lock
+ * perf_event::mmap_mutex
+ * perf_buffer::aux_mutex
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -6373,12 +6374,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
event->pmu->event_unmapped(event, vma->vm_mm);
/*
- * rb->aux_mmap_count will always drop before rb->mmap_count and
- * event->mmap_count, so it is ok to use event->mmap_mutex to
- * serialize with perf_mmap here.
+	 * The AUX buffer is strictly a sub-buffer; serialize using aux_mutex
+	 * to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@@ -6395,7 +6395,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
rb_free_aux(rb);
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
- mutex_unlock(&event->mmap_mutex);
+ mutex_unlock(&rb->aux_mutex);
}
if (atomic_dec_and_test(&rb->mmap_count))
@@ -6483,6 +6483,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct mutex *aux_mutex = NULL;
struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
unsigned long vma_size;
@@ -6531,6 +6532,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!rb)
goto aux_unlock;
+ aux_mutex = &rb->aux_mutex;
+ mutex_lock(aux_mutex);
+
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
@@ -6681,6 +6685,8 @@ unlock:
atomic_dec(&rb->mmap_count);
}
aux_unlock:
+ if (aux_mutex)
+ mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
/*
@@ -9706,7 +9712,8 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
- if (event->prog && !bpf_overflow_handler(event, data, regs))
+ if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ !bpf_overflow_handler(event, data, regs))
return ret;
/*
@@ -13351,6 +13358,15 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
return &event->attr;
}
+int perf_allow_kernel(struct perf_event_attr *attr)
+{
+ if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
+ return -EACCES;
+
+ return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
+}
+EXPORT_SYMBOL_GPL(perf_allow_kernel);
+
/*
* Inherit an event from parent task to child task.
*
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 451514442a1b..e072d995d670 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -40,6 +40,7 @@ struct perf_buffer {
struct user_struct *mmap_user;
/* AUX area */
+ struct mutex aux_mutex;
long aux_head;
unsigned int aux_nest;
long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 8cadf97bc290..4f46f688d0d4 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -337,6 +337,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
*/
if (!rb->nr_pages)
rb->paused = 1;
+
+ mutex_init(&rb->aux_mutex);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
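
The three perf hunks above introduce a dedicated rb->aux_mutex for the AUX area, initialize it in ring_buffer_init(), and slot it into the documented lock order after perf_event::mmap_mutex, so AUX teardown no longer serializes on the much wider mmap_mutex. As a loose userspace sketch of that ownership shape (pthreads and invented names, not the perf code):

#include <pthread.h>

/* The buffer owns two locks; the only allowed nesting is outer -> aux. */
struct buffer {
        pthread_mutex_t outer;  /* covers the whole mapping       */
        pthread_mutex_t aux;    /* covers only the AUX sub-buffer */
        int aux_users;
};

static void buffer_init(struct buffer *b)
{
        pthread_mutex_init(&b->outer, NULL);
        pthread_mutex_init(&b->aux, NULL);      /* set up with the buffer */
        b->aux_users = 0;
}

/* AUX-only teardown: the narrow lock is enough, the wide one stays free. */
static void buffer_put_aux(struct buffer *b)
{
        pthread_mutex_lock(&b->aux);
        if (b->aux_users > 0)
                b->aux_users--;
        pthread_mutex_unlock(&b->aux);
}

/* Setting up AUX touches shared mapping state too: nest in the fixed order. */
static void buffer_get_aux(struct buffer *b)
{
        pthread_mutex_lock(&b->outer);
        pthread_mutex_lock(&b->aux);
        b->aux_users++;
        pthread_mutex_unlock(&b->aux);
        pthread_mutex_unlock(&b->outer);
}

int main(void)
{
        struct buffer b;

        buffer_init(&b);
        buffer_get_aux(&b);
        buffer_put_aux(&b);
        return 0;
}
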
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 73cc47708679..50d7949be2b1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1489,7 +1489,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
struct xol_area *area;
void *insns;
- area = kmalloc(sizeof(*area), GFP_KERNEL);
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
@@ -1499,7 +1499,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
goto free_area;
area->xol_mapping.name = "[uprobes]";
- area->xol_mapping.fault = NULL;
area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])
diff --git a/kernel/fork.c b/kernel/fork.c
index cc760491f201..d4b2d543f48c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1861,7 +1861,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
prev_cputime_init(&sig->prev_cputime);
#ifdef CONFIG_POSIX_TIMERS
- INIT_LIST_HEAD(&sig->posix_timers);
+ INIT_HLIST_HEAD(&sig->posix_timers);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
#endif
@@ -2311,7 +2311,6 @@ __latent_entropy struct task_struct *copy_process(
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
- p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc94e0bf2c94..271e9139de77 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -198,7 +198,7 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
irqd_clr_managed_shutdown(d);
- if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
+ if (!cpumask_intersects(aff, cpu_online_mask)) {
/*
* Catch code which fiddles with enable_irq() on a managed
* and potentially shutdown IRQ. Chained interrupt
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index eb8628390156..15a7654eff68 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -37,7 +37,7 @@ static inline bool irq_needs_fixup(struct irq_data *d)
* has been removed from the online mask already.
*/
if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
- cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+ !cpumask_intersects(m, cpu_online_mask)) {
/*
* If this happens then there was a missed IRQ fixup at some
* point. Warn about it and enforce fixup.
@@ -110,7 +110,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
if (maskchip && chip->irq_mask)
chip->irq_mask(d);
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+ if (!cpumask_intersects(affinity, cpu_online_mask)) {
/*
* If the interrupt is managed, then shut it down and leave
* the affinity untouched.
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 3d4036db15ac..1a3d483548e2 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -13,7 +13,6 @@
struct irq_sim_work_ctx {
struct irq_work work;
- int irq_base;
unsigned int irq_count;
unsigned long *pending;
struct irq_domain *domain;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 07e99c936ba5..1dee88ba0ae4 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -530,6 +530,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
flags = IRQD_AFFINITY_MANAGED |
IRQD_MANAGED_SHUTDOWN;
}
+ flags |= IRQD_AFFINITY_SET;
mask = &affinity->mask;
node = cpu_to_node(cpumask_first(mask));
affinity++;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cea8f6874b1f..e0bff21f30e0 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -128,72 +128,98 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
}
EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
-static int irq_domain_set_name(struct irq_domain *domain,
- const struct fwnode_handle *fwnode,
- enum irq_domain_bus_token bus_token)
+static int alloc_name(struct irq_domain *domain, char *base, enum irq_domain_bus_token bus_token)
+{
+ if (bus_token == DOMAIN_BUS_ANY)
+ domain->name = kasprintf(GFP_KERNEL, "%s", base);
+ else
+ domain->name = kasprintf(GFP_KERNEL, "%s-%d", base, bus_token);
+ if (!domain->name)
+ return -ENOMEM;
+
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+
+static int alloc_fwnode_name(struct irq_domain *domain, const struct fwnode_handle *fwnode,
+ enum irq_domain_bus_token bus_token, const char *suffix)
+{
+ const char *sep = suffix ? "-" : "";
+ const char *suf = suffix ? : "";
+ char *name;
+
+ if (bus_token == DOMAIN_BUS_ANY)
+ name = kasprintf(GFP_KERNEL, "%pfw%s%s", fwnode, sep, suf);
+ else
+ name = kasprintf(GFP_KERNEL, "%pfw%s%s-%d", fwnode, sep, suf, bus_token);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * fwnode paths contain '/', which debugfs is legitimately unhappy
+ * about. Replace them with ':', which does the trick and is not as
+ * offensive as '\'...
+ */
+ domain->name = strreplace(name, '/', ':');
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+
+static int alloc_unknown_name(struct irq_domain *domain, enum irq_domain_bus_token bus_token)
{
static atomic_t unknown_domains;
- struct irqchip_fwid *fwid;
+ int id = atomic_inc_return(&unknown_domains);
+
+ if (bus_token == DOMAIN_BUS_ANY)
+ domain->name = kasprintf(GFP_KERNEL, "unknown-%d", id);
+ else
+ domain->name = kasprintf(GFP_KERNEL, "unknown-%d-%d", id, bus_token);
+ if (!domain->name)
+ return -ENOMEM;
+
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+
+static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domain_info *info)
+{
+ enum irq_domain_bus_token bus_token = info->bus_token;
+ const struct fwnode_handle *fwnode = info->fwnode;
if (is_fwnode_irqchip(fwnode)) {
- fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+ struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+
+ /*
+ * The name_suffix is only intended to be used to avoid a name
+ * collision when multiple domains are created for a single
+ * device and the name is picked using a real device node.
+ * (Typical use-case is regmap-IRQ controllers for devices
+ * providing more than one physical IRQ.) There should be no
+ * need to use name_suffix with irqchip-fwnode.
+ */
+ if (info->name_suffix)
+ return -EINVAL;
switch (fwid->type) {
case IRQCHIP_FWNODE_NAMED:
case IRQCHIP_FWNODE_NAMED_ID:
- domain->name = bus_token ?
- kasprintf(GFP_KERNEL, "%s-%d",
- fwid->name, bus_token) :
- kstrdup(fwid->name, GFP_KERNEL);
- if (!domain->name)
- return -ENOMEM;
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
- break;
+ return alloc_name(domain, fwid->name, bus_token);
default:
domain->name = fwid->name;
- if (bus_token) {
- domain->name = kasprintf(GFP_KERNEL, "%s-%d",
- fwid->name, bus_token);
- if (!domain->name)
- return -ENOMEM;
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
- }
- break;
+ if (bus_token != DOMAIN_BUS_ANY)
+ return alloc_name(domain, fwid->name, bus_token);
}
- } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) ||
- is_software_node(fwnode)) {
- char *name;
-
- /*
- * fwnode paths contain '/', which debugfs is legitimately
- * unhappy about. Replace them with ':', which does
- * the trick and is not as offensive as '\'...
- */
- name = bus_token ?
- kasprintf(GFP_KERNEL, "%pfw-%d", fwnode, bus_token) :
- kasprintf(GFP_KERNEL, "%pfw", fwnode);
- if (!name)
- return -ENOMEM;
- domain->name = strreplace(name, '/', ':');
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || is_software_node(fwnode)) {
+ return alloc_fwnode_name(domain, fwnode, bus_token, info->name_suffix);
}
- if (!domain->name) {
- if (fwnode)
- pr_err("Invalid fwnode type for irqdomain\n");
- domain->name = bus_token ?
- kasprintf(GFP_KERNEL, "unknown-%d-%d",
- atomic_inc_return(&unknown_domains),
- bus_token) :
- kasprintf(GFP_KERNEL, "unknown-%d",
- atomic_inc_return(&unknown_domains));
- if (!domain->name)
- return -ENOMEM;
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
- }
+ if (domain->name)
+ return 0;
- return 0;
+ if (fwnode)
+ pr_err("Invalid fwnode type for irqdomain\n");
+ return alloc_unknown_name(domain, bus_token);
}
static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info)
@@ -211,7 +237,7 @@ static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info
if (!domain)
return ERR_PTR(-ENOMEM);
- err = irq_domain_set_name(domain, info->fwnode, info->bus_token);
+ err = irq_domain_set_name(domain, info);
if (err) {
kfree(domain);
return ERR_PTR(err);
@@ -267,13 +293,20 @@ static void irq_domain_free(struct irq_domain *domain)
kfree(domain);
}
-/**
- * irq_domain_instantiate() - Instantiate a new irq domain data structure
- * @info: Domain information pointer pointing to the information for this domain
- *
- * Return: A pointer to the instantiated irq domain or an ERR_PTR value.
- */
-struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info)
+static void irq_domain_instantiate_descs(const struct irq_domain_info *info)
+{
+ if (!IS_ENABLED(CONFIG_SPARSE_IRQ))
+ return;
+
+ if (irq_alloc_descs(info->virq_base, info->virq_base, info->size,
+ of_node_to_nid(to_of_node(info->fwnode))) < 0) {
+ pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
+ info->virq_base);
+ }
+}
+
+static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info *info,
+ bool cond_alloc_descs, bool force_associate)
{
struct irq_domain *domain;
int err;
@@ -306,6 +339,19 @@ struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info)
__irq_domain_publish(domain);
+ if (cond_alloc_descs && info->virq_base > 0)
+ irq_domain_instantiate_descs(info);
+
+ /*
+ * Legacy interrupt domains have a fixed Linux interrupt number
+ * associated. Other interrupt domains can request association by
+ * providing a Linux interrupt number > 0.
+ */
+ if (force_associate || info->virq_base > 0) {
+ irq_domain_associate_many(domain, info->virq_base, info->hwirq_base,
+ info->size - info->hwirq_base);
+ }
+
return domain;
err_domain_gc_remove:
@@ -315,6 +361,17 @@ err_domain_free:
irq_domain_free(domain);
return ERR_PTR(err);
}
+
+/**
+ * irq_domain_instantiate() - Instantiate a new irq domain data structure
+ * @info: Domain information pointer pointing to the information for this domain
+ *
+ * Return: A pointer to the instantiated irq domain or an ERR_PTR value.
+ */
+struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info)
+{
+ return __irq_domain_instantiate(info, false, false);
+}
EXPORT_SYMBOL_GPL(irq_domain_instantiate);
/**
@@ -413,28 +470,13 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
.fwnode = fwnode,
.size = size,
.hwirq_max = size,
+ .virq_base = first_irq,
.ops = ops,
.host_data = host_data,
};
- struct irq_domain *domain;
-
- domain = irq_domain_instantiate(&info);
- if (IS_ERR(domain))
- return NULL;
+ struct irq_domain *domain = __irq_domain_instantiate(&info, true, false);
- if (first_irq > 0) {
- if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
- /* attempt to allocated irq_descs */
- int rc = irq_alloc_descs(first_irq, first_irq, size,
- of_node_to_nid(to_of_node(fwnode)));
- if (rc < 0)
- pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
- first_irq);
- }
- irq_domain_associate_many(domain, first_irq, 0, size);
- }
-
- return domain;
+ return IS_ERR(domain) ? NULL : domain;
}
EXPORT_SYMBOL_GPL(irq_domain_create_simple);
@@ -476,18 +518,14 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
.fwnode = fwnode,
.size = first_hwirq + size,
.hwirq_max = first_hwirq + size,
+ .hwirq_base = first_hwirq,
+ .virq_base = first_irq,
.ops = ops,
.host_data = host_data,
};
- struct irq_domain *domain;
+ struct irq_domain *domain = __irq_domain_instantiate(&info, false, true);
- domain = irq_domain_instantiate(&info);
- if (IS_ERR(domain))
- return NULL;
-
- irq_domain_associate_many(domain, first_irq, first_hwirq, size);
-
- return domain;
+ return IS_ERR(domain) ? NULL : domain;
}
EXPORT_SYMBOL_GPL(irq_domain_create_legacy);
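
The irqdomain rework above splits the name construction into alloc_name()/alloc_fwnode_name()/alloc_unknown_name() helpers while keeping the rule that '/' in an fwnode path is rewritten to ':' before the name reaches debugfs. A minimal userspace sketch of that sanitize-and-allocate step, with alloc_debugfs_name() as an invented stand-in:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build "<path>[-<bus>]" and replace '/' with ':', since debugfs does
 * not accept '/' inside a directory name. */
static char *alloc_debugfs_name(const char *path, int bus)
{
        size_t len = strlen(path) + 16;
        char *name = malloc(len);
        char *p;

        if (!name)
                return NULL;

        if (bus)
                snprintf(name, len, "%s-%d", path, bus);
        else
                snprintf(name, len, "%s", path);

        for (p = name; (p = strchr(p, '/')) != NULL; p++)
                *p = ':';

        return name;
}

int main(void)
{
        char *name = alloc_debugfs_name("/soc/intc@f00", 3);

        if (name)
                printf("%s\n", name);   /* ":soc:intc@f00-3" */
        free(name);
        return 0;
}
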
@@ -1365,7 +1403,7 @@ static int irq_domain_trim_hierarchy(unsigned int virq)
tail = NULL;
/* The first entry must have a valid irqchip */
- if (!irq_data->chip || IS_ERR(irq_data->chip))
+ if (IS_ERR_OR_NULL(irq_data->chip))
return -EINVAL;
/*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index dd53298ef1a5..f0803d6bd296 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -218,21 +218,20 @@ static void irq_validate_effective_affinity(struct irq_data *data)
static inline void irq_validate_effective_affinity(struct irq_data *data) { }
#endif
+static DEFINE_PER_CPU(struct cpumask, __tmp_mask);
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
+ struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask);
struct irq_desc *desc = irq_data_to_desc(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
const struct cpumask *prog_mask;
int ret;
- static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
- static struct cpumask tmp_mask;
-
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- raw_spin_lock(&tmp_mask_lock);
/*
* If this is a managed interrupt and housekeeping is enabled on
* it check whether the requested affinity mask intersects with
@@ -258,11 +257,11 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
- cpumask_and(&tmp_mask, mask, hk_mask);
- if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
+ cpumask_and(tmp_mask, mask, hk_mask);
+ if (!cpumask_intersects(tmp_mask, cpu_online_mask))
prog_mask = mask;
else
- prog_mask = &tmp_mask;
+ prog_mask = tmp_mask;
} else {
prog_mask = mask;
}
@@ -272,16 +271,14 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
* unless we are being asked to force the affinity (in which
* case we do as we are told).
*/
- cpumask_and(&tmp_mask, prog_mask, cpu_online_mask);
- if (!force && !cpumask_empty(&tmp_mask))
- ret = chip->irq_set_affinity(data, &tmp_mask, force);
+ cpumask_and(tmp_mask, prog_mask, cpu_online_mask);
+ if (!force && !cpumask_empty(tmp_mask))
+ ret = chip->irq_set_affinity(data, tmp_mask, force);
else if (force)
ret = chip->irq_set_affinity(data, mask, force);
else
ret = -EINVAL;
- raw_spin_unlock(&tmp_mask_lock);
-
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
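
The irq_do_set_affinity() hunk trades a static cpumask plus a global raw spinlock for a per-CPU scratch mask; the call already runs with the descriptor lock held and interrupts disabled, so the per-CPU copy cannot be re-entered and the global serialization point disappears. A very loose userspace analogue is swapping a lock-protected static buffer for a thread-local one:

#include <stdio.h>

/* Before: one shared scratch buffer, every caller serialized on a lock.
 * After:  one scratch buffer per thread, no lock needed at all. */
static _Thread_local char scratch[64];

static const char *format_id(unsigned int id)
{
        snprintf(scratch, sizeof(scratch), "irq-%u", id);
        return scratch; /* valid until this thread reuses its scratch */
}

int main(void)
{
        printf("%s\n", format_id(42));
        return 0;
}
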
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 61ca924ef4b4..eb150afd671f 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -26,7 +26,7 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
* The outgoing CPU might be the last online target in a pending
* interrupt move. If that's the case clear the pending move bit.
*/
- if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) {
+ if (!cpumask_intersects(desc->pending_mask, cpu_online_mask)) {
irqd_clr_move_pending(data);
return false;
}
@@ -74,7 +74,7 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) {
+ if (cpumask_intersects(desc->pending_mask, cpu_online_mask)) {
int ret;
ret = irq_do_set_affinity(data, desc->pending_mask, false);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 5fa0547ece0c..1c7e5159064c 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -82,7 +82,7 @@ static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
desc->dev = dev;
desc->nvec_used = nvec;
if (affinity) {
- desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ desc->affinity = kmemdup_array(affinity, nvec, sizeof(*desc->affinity), GFP_KERNEL);
if (!desc->affinity) {
kfree(desc);
return NULL;
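
The msi_alloc_desc() change switches from an open-coded nvec * sizeof(*desc->affinity) to kmemdup_array(), whose point is that the size multiplication is overflow-checked in one place. A standalone sketch of that guard (memdup_array() is an invented name here, not a libc or kernel symbol):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Duplicate 'n' elements of 'size' bytes, refusing counts that would
 * overflow the n * size multiplication. */
static void *memdup_array(const void *src, size_t n, size_t size)
{
        void *p;

        if (size && n > SIZE_MAX / size)
                return NULL;    /* n * size would overflow */

        p = malloc(n * size);
        if (p)
                memcpy(p, src, n * size);
        return p;
}

int main(void)
{
        int vals[4] = { 1, 2, 3, 4 };
        int *copy = memdup_array(vals, 4, sizeof(*vals));

        free(copy);
        return 0;
}
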
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 8cccdf40725a..9081ada81c3d 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -52,10 +52,8 @@ static int show_irq_affinity(int type, struct seq_file *m)
case AFFINITY:
case AFFINITY_LIST:
mask = desc->irq_common_data.affinity;
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (irqd_is_setaffinity_pending(&desc->irq_data))
- mask = desc->pending_mask;
-#endif
+ if (irq_move_pending(&desc->irq_data))
+ mask = irq_desc_get_pending_mask(desc);
break;
case EFFECTIVE:
case EFFECTIVE_LIST:
@@ -142,7 +140,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
int err;
if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
- return -EIO;
+ return -EPERM;
if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
@@ -362,8 +360,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
#ifdef CONFIG_SMP
+ umode_t umode = S_IRUGO;
+
+ if (irq_can_set_affinity_usr(desc->irq_data.irq))
+ umode |= S_IWUSR;
+
/* create /proc/irq/<irq>/smp_affinity */
- proc_create_data("smp_affinity", 0644, desc->dir,
+ proc_create_data("smp_affinity", umode, desc->dir,
&irq_affinity_proc_ops, irqp);
/* create /proc/irq/<irq>/affinity_hint */
@@ -371,7 +374,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
irq_affinity_hint_proc_show, irqp);
/* create /proc/irq/<irq>/smp_affinity_list */
- proc_create_data("smp_affinity_list", 0644, desc->dir,
+ proc_create_data("smp_affinity_list", umode, desc->dir,
&irq_affinity_list_proc_ops, irqp);
proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 4ad5ed8adf96..6dc76b590703 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -236,7 +236,7 @@ void static_key_disable_cpuslocked(struct static_key *key)
}
jump_label_lock();
- if (atomic_cmpxchg(&key->enabled, 1, 0))
+ if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
jump_label_update(key);
jump_label_unlock();
}
@@ -289,7 +289,7 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key)
return;
guard(mutex)(&jump_label_mutex);
- if (atomic_cmpxchg(&key->enabled, 1, 0))
+ if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
jump_label_update(key);
else
WARN_ON_ONCE(!static_key_slow_try_dec(key));
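
The jump_label fix above hinges on atomic_cmpxchg() returning the previously stored value rather than a success flag: if a concurrent update left anything other than 1 in key->enabled, the old truthiness test fired even though no 1->0 transition took place. A small standalone illustration of that semantic, emulating the kernel helper with C11 atomics (the names here are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdio.h>

/* Emulate kernel-style cmpxchg: try to replace 'old' with 'new' and
 * return whatever value was actually observed, swap or no swap. */
static int cmpxchg_int(atomic_int *v, int old, int new)
{
        int expected = old;

        atomic_compare_exchange_strong(v, &expected, new);
        return expected;
}

int main(void)
{
        atomic_int enabled = 2; /* e.g. a concurrent increment got there first */
        int ret = cmpxchg_int(&enabled, 1, 0);

        /* ret is 2: truthy, but the 1 -> 0 transition did not happen. */
        printf("observed %d, enabled now %d\n", ret, atomic_load(&enabled));
        printf("old test (ret)      -> %s\n", ret ? "would update" : "skip");
        printf("fixed test (ret==1) -> %s\n", ret == 1 ? "would update" : "skip");
        return 0;
}
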
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fb2c77368d18..a9a0ca605d4a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -160,38 +160,6 @@ unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
-static void cleanup_symbol_name(char *s)
-{
- char *res;
-
- if (!IS_ENABLED(CONFIG_LTO_CLANG))
- return;
-
- /*
- * LLVM appends various suffixes for local functions and variables that
- * must be promoted to global scope as part of LTO. This can break
- * hooking of static functions with kprobes. '.' is not a valid
- * character in an identifier in C. Suffixes only in LLVM LTO observed:
- * - foo.llvm.[0-9a-f]+
- */
- res = strstr(s, ".llvm.");
- if (res)
- *res = '\0';
-
- return;
-}
-
-static int compare_symbol_name(const char *name, char *namebuf)
-{
- /* The kallsyms_seqs_of_names is sorted based on names after
- * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled.
- * To ensure correct bisection in kallsyms_lookup_names(), do
- * cleanup_symbol_name(namebuf) before comparing name and namebuf.
- */
- cleanup_symbol_name(namebuf);
- return strcmp(name, namebuf);
-}
-
static unsigned int get_symbol_seq(int index)
{
unsigned int i, seq = 0;
@@ -219,7 +187,7 @@ static int kallsyms_lookup_names(const char *name,
seq = get_symbol_seq(mid);
off = get_symbol_offset(seq);
kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = compare_symbol_name(name, namebuf);
+ ret = strcmp(name, namebuf);
if (ret > 0)
low = mid + 1;
else if (ret < 0)
@@ -236,7 +204,7 @@ static int kallsyms_lookup_names(const char *name,
seq = get_symbol_seq(low - 1);
off = get_symbol_offset(seq);
kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- if (compare_symbol_name(name, namebuf))
+ if (strcmp(name, namebuf))
break;
low--;
}
@@ -248,7 +216,7 @@ static int kallsyms_lookup_names(const char *name,
seq = get_symbol_seq(high + 1);
off = get_symbol_offset(seq);
kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- if (compare_symbol_name(name, namebuf))
+ if (strcmp(name, namebuf))
break;
high++;
}
@@ -407,8 +375,7 @@ static int kallsyms_lookup_buildid(unsigned long addr,
if (modbuildid)
*modbuildid = NULL;
- ret = strlen(namebuf);
- goto found;
+ return strlen(namebuf);
}
/* See if it's in a module or a BPF JITed image. */
@@ -422,8 +389,6 @@ static int kallsyms_lookup_buildid(unsigned long addr,
ret = ftrace_mod_address_lookup(addr, symbolsize,
offset, modname, namebuf);
-found:
- cleanup_symbol_name(namebuf);
return ret;
}
@@ -450,8 +415,6 @@ const char *kallsyms_lookup(unsigned long addr,
int lookup_symbol_name(unsigned long addr, char *symname)
{
- int res;
-
symname[0] = '\0';
symname[KSYM_NAME_LEN - 1] = '\0';
@@ -462,16 +425,10 @@ int lookup_symbol_name(unsigned long addr, char *symname)
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
symname, KSYM_NAME_LEN);
- goto found;
+ return 0;
}
/* See if it's in a module. */
- res = lookup_module_symbol_name(addr, symname);
- if (res)
- return res;
-
-found:
- cleanup_symbol_name(symname);
- return 0;
+ return lookup_module_symbol_name(addr, symname);
}
/* Look up a kernel symbol and return it in a text buffer. */
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index 2f84896a7bcb..873f7c445488 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -187,31 +187,11 @@ static void test_perf_kallsyms_lookup_name(void)
stat.min, stat.max, div_u64(stat.sum, stat.real_cnt));
}
-static bool match_cleanup_name(const char *s, const char *name)
-{
- char *p;
- int len;
-
- if (!IS_ENABLED(CONFIG_LTO_CLANG))
- return false;
-
- p = strstr(s, ".llvm.");
- if (!p)
- return false;
-
- len = strlen(name);
- if (p - s != len)
- return false;
-
- return !strncmp(s, name, len);
-}
-
static int find_symbol(void *data, const char *name, unsigned long addr)
{
struct test_stat *stat = (struct test_stat *)data;
- if (strcmp(name, stat->name) == 0 ||
- (!stat->perf && match_cleanup_name(name, stat->name))) {
+ if (!strcmp(name, stat->name)) {
stat->real_cnt++;
stat->addr = addr;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index f0a69d402066..28a6be6e64fd 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/hashtable.h>
#include <linux/init.h>
+#include <linux/jiffies.h>
#include <linux/kmsan-checks.h>
#include <linux/mm.h>
#include <linux/preempt.h>
@@ -161,6 +162,15 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
kmsan_unpoison_memory(&area->list, sizeof(area->list));
}
+/*
+ * Unlike in_serving_softirq(), this function returns false when called during
+ * a hardirq or an NMI that happened in the softirq context.
+ */
+static inline bool in_softirq_really(void)
+{
+ return in_serving_softirq() && !in_hardirq() && !in_nmi();
+}
+
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
unsigned int mode;
@@ -170,7 +180,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru
* so we ignore code executed in interrupts, unless we are in a remote
* coverage collection section in a softirq.
*/
- if (!in_task() && !(in_serving_softirq() && t->kcov_softirq))
+ if (!in_task() && !(in_softirq_really() && t->kcov_softirq))
return false;
mode = READ_ONCE(t->kcov_mode);
/*
@@ -849,7 +859,7 @@ void kcov_remote_start(u64 handle)
if (WARN_ON(!kcov_check_handle(handle, true, true, true)))
return;
- if (!in_task() && !in_serving_softirq())
+ if (!in_task() && !in_softirq_really())
return;
local_lock_irqsave(&kcov_percpu_data.lock, flags);
@@ -991,7 +1001,7 @@ void kcov_remote_stop(void)
int sequence;
unsigned long flags;
- if (!in_task() && !in_serving_softirq())
+ if (!in_task() && !in_softirq_really())
return;
local_lock_irqsave(&kcov_percpu_data.lock, flags);
@@ -1058,6 +1068,32 @@ u64 kcov_common_handle(void)
}
EXPORT_SYMBOL(kcov_common_handle);
+#ifdef CONFIG_KCOV_SELFTEST
+static void __init selftest(void)
+{
+ unsigned long start;
+
+ pr_err("running self test\n");
+ /*
+ * Test that interrupts don't produce spurious coverage.
+ * The coverage callback filters out interrupt code, but only
+ * after the handler updates preempt count. Some code periodically
+ * leaks out of that section and leads to spurious coverage.
+ * It's hard to call the actual interrupt handler directly,
+ * so we just loop here for a bit waiting for a timer interrupt.
+ * We set kcov_mode to enable tracing, but don't setup the area,
+ * so any attempt to trace will crash. Note: we must not call any
+ * potentially traced functions in this region.
+ */
+ start = jiffies;
+ current->kcov_mode = KCOV_MODE_TRACE_PC;
+ while ((jiffies - start) * MSEC_PER_SEC / HZ < 300)
+ ;
+ current->kcov_mode = 0;
+ pr_err("done running self test\n");
+}
+#endif
+
static int __init kcov_init(void)
{
int cpu;
@@ -1077,6 +1113,10 @@ static int __init kcov_init(void)
*/
debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+#ifdef CONFIG_KCOV_SELFTEST
+ selftest();
+#endif
+
return 0;
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 3d64290d24c9..3eedb8c226ad 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -752,7 +752,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
#ifdef CONFIG_CRASH_HOTPLUG
/* Exclude elfcorehdr segment to allow future changes via hotplug */
- if (j == image->elfcorehdr_index)
+ if (i == image->elfcorehdr_index)
continue;
#endif
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e85de37d9e1e..da59c68df841 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1557,8 +1557,8 @@ static bool is_cfi_preamble_symbol(unsigned long addr)
if (lookup_symbol_name(addr, symbuf))
return false;
- return str_has_prefix("__cfi_", symbuf) ||
- str_has_prefix("__pfx_", symbuf);
+ return str_has_prefix(symbuf, "__cfi_") ||
+ str_has_prefix(symbuf, "__pfx_");
}
static int check_kprobe_address_safe(struct kprobe *p,
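
The kprobes fix above is purely an argument-order bug: str_has_prefix(str, prefix) reports whether str starts with prefix, so the swapped call asked whether "__cfi_" starts with the looked-up symbol and always failed. A standalone illustration with a local re-implementation of the helper:

#include <stdio.h>
#include <string.h>

/* Local stand-in mirroring the kernel helper: returns the prefix length
 * when 'str' starts with 'prefix', otherwise 0. */
static size_t str_has_prefix(const char *str, const char *prefix)
{
        size_t len = strlen(prefix);

        return strncmp(str, prefix, len) == 0 ? len : 0;
}

int main(void)
{
        const char *sym = "__cfi_do_something";

        /* Reversed arguments ask whether the literal "__cfi_" starts with
         * the symbol name, so CFI preamble symbols were never recognized. */
        printf("swapped: %zu\n", str_has_prefix("__cfi_", sym));        /* 0 */
        printf("correct: %zu\n", str_has_prefix(sym, "__cfi_"));        /* 6 */
        return 0;
}
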
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 07fb5987b42b..1bab21b4718f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -92,7 +92,14 @@ static ssize_t profiling_store(struct kobject *kobj,
const char *buf, size_t count)
{
int ret;
+ static DEFINE_MUTEX(lock);
+ /*
+ * We need serialization here: profile_setup() initializes the prof_on
+ * value, and profile_init() must not reallocate prof_buffer once it
+ * has been allocated.
+ */
+ guard(mutex)(&lock);
if (prof_on)
return -EEXIST;
/*
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 58c88220a478..7963deac33c3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -56,6 +56,7 @@
#include <linux/kprobes.h>
#include <linux/lockdep.h>
#include <linux/context_tracking.h>
+#include <linux/console.h>
#include <asm/sections.h>
@@ -573,8 +574,10 @@ static struct lock_trace *save_trace(void)
if (!debug_locks_off_graph_unlock())
return NULL;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
@@ -887,11 +890,13 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
instrumentation_begin();
debug_locks_off();
+ nbcon_cpu_emergency_enter();
printk(KERN_ERR
"BUG: looking up invalid subclass: %u\n", subclass);
printk(KERN_ERR
"turning off the locking correctness validator.\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
instrumentation_end();
return NULL;
}
@@ -968,11 +973,13 @@ static bool assign_lock_key(struct lockdep_map *lock)
else {
/* Debug-check: all keys must be persistent! */
debug_locks_off();
+ nbcon_cpu_emergency_enter();
pr_err("INFO: trying to register non-static key.\n");
pr_err("The code is fine but needs lockdep annotation, or maybe\n");
pr_err("you didn't initialize this object before use?\n");
pr_err("turning off the locking correctness validator.\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
return false;
}
@@ -1316,8 +1323,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
return NULL;
}
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
nr_lock_classes++;
@@ -1349,11 +1358,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (verbose(class)) {
graph_unlock();
+ nbcon_cpu_emergency_enter();
printk("\nnew class %px: %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
if (!graph_lock()) {
return NULL;
@@ -1392,8 +1403,10 @@ static struct lock_list *alloc_list_entry(void)
if (!debug_locks_off_graph_unlock())
return NULL;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
nr_list_entries++;
@@ -2039,6 +2052,8 @@ static noinline void print_circular_bug(struct lock_list *this,
depth = get_lock_depth(target);
+ nbcon_cpu_emergency_enter();
+
print_circular_bug_header(target, depth, check_src, check_tgt);
parent = get_lock_parent(target);
@@ -2057,6 +2072,8 @@ static noinline void print_circular_bug(struct lock_list *this,
printk("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static noinline void print_bfs_bug(int ret)
@@ -2569,6 +2586,8 @@ print_bad_irq_dependency(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=====================================================\n");
pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
@@ -2618,11 +2637,13 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
next_root->trace = save_trace();
if (!next_root->trace)
- return;
+ goto out;
print_shortest_lock_dependencies(forwards_entry, next_root);
pr_warn("\nstack backtrace:\n");
dump_stack();
+out:
+ nbcon_cpu_emergency_exit();
}
static const char *state_names[] = {
@@ -2987,6 +3008,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("============================================\n");
pr_warn("WARNING: possible recursive locking detected\n");
@@ -3009,6 +3032,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
/*
@@ -3606,6 +3631,8 @@ static void print_collision(struct task_struct *curr,
struct held_lock *hlock_next,
struct lock_chain *chain)
{
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("============================\n");
pr_warn("WARNING: chain_key collision\n");
@@ -3622,6 +3649,8 @@ static void print_collision(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
#endif
@@ -3712,8 +3741,10 @@ static inline int add_chain_cache(struct task_struct *curr,
if (!debug_locks_off_graph_unlock())
return 0;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
chain->chain_key = chain_key;
@@ -3730,8 +3761,10 @@ static inline int add_chain_cache(struct task_struct *curr,
if (!debug_locks_off_graph_unlock())
return 0;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
@@ -3970,6 +4003,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
if (!debug_locks_off() || debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("================================\n");
pr_warn("WARNING: inconsistent lock state\n");
@@ -3998,6 +4033,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
/*
@@ -4032,6 +4069,8 @@ print_irq_inversion_bug(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("========================================================\n");
pr_warn("WARNING: possible irq lock inversion dependency detected\n");
@@ -4072,11 +4111,13 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
root->trace = save_trace();
if (!root->trace)
- return;
+ goto out;
print_shortest_lock_dependencies(other, root);
pr_warn("\nstack backtrace:\n");
dump_stack();
+out:
+ nbcon_cpu_emergency_exit();
}
/*
@@ -4153,6 +4194,8 @@ void print_irqtrace_events(struct task_struct *curr)
{
const struct irqtrace_events *trace = &curr->irqtrace;
+ nbcon_cpu_emergency_enter();
+
printk("irq event stamp: %u\n", trace->irq_events);
printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip,
@@ -4166,6 +4209,8 @@ void print_irqtrace_events(struct task_struct *curr)
printk("softirqs last disabled at (%u): [<%px>] %pS\n",
trace->softirq_disable_event, (void *)trace->softirq_disable_ip,
(void *)trace->softirq_disable_ip);
+
+ nbcon_cpu_emergency_exit();
}
static int HARDIRQ_verbose(struct lock_class *class)
@@ -4686,10 +4731,12 @@ unlock:
* We must printk outside of the graph_lock:
*/
if (ret == 2) {
+ nbcon_cpu_emergency_enter();
printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
print_lock(this);
print_irqtrace_events(curr);
dump_stack();
+ nbcon_cpu_emergency_exit();
}
return ret;
@@ -4730,6 +4777,8 @@ print_lock_invalid_wait_context(struct task_struct *curr,
if (debug_locks_silent)
return 0;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=============================\n");
pr_warn("[ BUG: Invalid wait context ]\n");
@@ -4749,6 +4798,8 @@ print_lock_invalid_wait_context(struct task_struct *curr,
pr_warn("stack backtrace:\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
+
return 0;
}
@@ -4956,6 +5007,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("==================================\n");
pr_warn("WARNING: Nested lock was not taken\n");
@@ -4976,6 +5029,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static int __lock_is_held(const struct lockdep_map *lock, int read);
@@ -5024,11 +5079,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
debug_class_ops_inc(class);
if (very_verbose(class)) {
+ nbcon_cpu_emergency_enter();
printk("\nacquire class [%px] %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
}
/*
@@ -5155,6 +5212,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
printk(KERN_DEBUG "depth: %i max: %lu!\n",
curr->lockdep_depth, MAX_LOCK_DEPTH);
@@ -5162,6 +5220,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_print_held_locks(current);
debug_show_all_locks();
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
@@ -5181,6 +5240,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr,
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=====================================\n");
pr_warn("WARNING: bad unlock balance detected!\n");
@@ -5197,6 +5258,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static noinstr int match_held_lock(const struct held_lock *hlock,
@@ -5901,6 +5964,8 @@ static void print_lock_contention_bug(struct task_struct *curr,
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=================================\n");
pr_warn("WARNING: bad contention detected!\n");
@@ -5917,6 +5982,8 @@ static void print_lock_contention_bug(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static void
@@ -5936,6 +6003,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return;
+
hlock = find_held_lock(curr, lock, depth, &i);
if (!hlock) {
print_lock_contention_bug(curr, lock, ip);
@@ -5978,6 +6048,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return;
+
hlock = find_held_lock(curr, lock, depth, &i);
if (!hlock) {
print_lock_contention_bug(curr, lock, _RET_IP_);
@@ -6530,6 +6603,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=========================\n");
pr_warn("WARNING: held lock freed!\n");
@@ -6542,6 +6617,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static inline int not_in_range(const void* mem_from, unsigned long mem_len,
@@ -6588,6 +6665,8 @@ static void print_held_locks_bug(void)
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("====================================\n");
pr_warn("WARNING: %s/%d still has locks held!\n",
@@ -6597,6 +6676,8 @@ static void print_held_locks_bug(void)
lockdep_print_held_locks(current);
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
void debug_check_no_locks_held(void)
@@ -6654,6 +6735,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
if (unlikely(curr->lockdep_depth)) {
if (!debug_locks_off())
return;
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("================================================\n");
pr_warn("WARNING: lock held when returning to user space!\n");
@@ -6662,6 +6744,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
pr_warn("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
+ nbcon_cpu_emergency_exit();
}
/*
@@ -6678,6 +6761,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("=============================\n");
pr_warn("WARNING: suspicious RCU usage\n");
@@ -6716,6 +6800,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f5a36e67b593..ac2e22502741 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -357,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- enum vcpu_state old = vcpu_halted;
+ u8 old = vcpu_halted;
/*
* If the vCPU is indeed halted, advance its state to match that of
* pv_wait_node(). If OTOH this fails, the vCPU was running and will
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 88d08eeb8bc0..fba1229f1de6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1644,6 +1644,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
}
static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_base *lock,
struct rt_mutex_waiter *w)
{
/*
@@ -1656,10 +1657,10 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
if (build_ww_mutex() && w->ww_ctx)
return;
- /*
- * Yell loudly and stop the task right here.
- */
+ raw_spin_unlock_irq(&lock->wait_lock);
+
WARN(1, "rtmutex deadlock detected\n");
+
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
rt_mutex_schedule();
@@ -1713,7 +1714,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
} else {
__set_current_state(TASK_RUNNING);
remove_waiter(lock, waiter);
- rt_mutex_handle_deadlock(ret, chwalk, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
}
/*
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index a10b2b9a6fdf..50ffcc413b54 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -5,7 +5,7 @@
# These are called from save_stack_trace() on slub debug path,
# and produce insane amounts of uninteresting coverage.
-KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_main.o := n
obj-y += main.o
obj-y += strict_rwx.o
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d9592195c5bb..71396e297499 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3104,7 +3104,7 @@ static bool idempotent(struct idempotent *u, const void *cookie)
struct idempotent *existing;
bool first;
- u->ret = 0;
+ u->ret = -EINTR;
u->cookie = cookie;
init_completion(&u->complete);
@@ -3140,7 +3140,7 @@ static int idempotent_complete(struct idempotent *u, int ret)
hlist_for_each_entry_safe(pos, next, head, entry) {
if (pos->cookie != cookie)
continue;
- hlist_del(&pos->entry);
+ hlist_del_init(&pos->entry);
pos->ret = ret;
complete(&pos->complete);
}
@@ -3148,6 +3148,28 @@ static int idempotent_complete(struct idempotent *u, int ret)
return ret;
}
+/*
+ * Wait for the idempotent worker.
+ *
+ * If we get interrupted, we need to remove ourselves from the
+ * idempotent list, and the completion may still come in.
+ *
+ * The 'idem_lock' protects against the race, and 'idem.ret' was
+ * initialized to -EINTR and is thus always the right return
+ * value even if the idempotent work then completes between
+ * the wait_for_completion and the cleanup.
+ */
+static int idempotent_wait_for_completion(struct idempotent *u)
+{
+ if (wait_for_completion_interruptible(&u->complete)) {
+ spin_lock(&idem_lock);
+ if (!hlist_unhashed(&u->entry))
+ hlist_del(&u->entry);
+ spin_unlock(&idem_lock);
+ }
+ return u->ret;
+}
+
static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
{
struct load_info info = { };
@@ -3183,15 +3205,16 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int
if (!f || !(f->f_mode & FMODE_READ))
return -EBADF;
- /* See if somebody else is doing the operation? */
- if (idempotent(&idem, file_inode(f))) {
- wait_for_completion(&idem.complete);
- return idem.ret;
+ /* Are we the winners of the race and get to do this? */
+ if (!idempotent(&idem, file_inode(f))) {
+ int ret = init_module_from_file(f, uargs, flags);
+ return idempotent_complete(&idem, ret);
}
- /* Otherwise, we'll do it and complete others */
- return idempotent_complete(&idem,
- init_module_from_file(f, uargs, flags));
+ /*
+ * Somebody else won the race and is loading the module.
+ */
+ return idempotent_wait_for_completion(&idem);
}
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
diff --git a/kernel/padata.c b/kernel/padata.c
index 53f4bc912712..d899f34558af 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -404,7 +404,8 @@ void padata_do_serial(struct padata_priv *padata)
/* Sort in ascending order of sequence number. */
list_for_each_prev(pos, &reorder->list) {
cur = list_entry(pos, struct padata_priv, list);
- if (cur->seq_nr < padata->seq_nr)
+ /* Compare by difference to consider integer wrap around */
+ if ((signed int)(cur->seq_nr - padata->seq_nr) < 0)
break;
}
list_add(&padata->list, pos);
@@ -512,11 +513,21 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
* thread function. Load balance large jobs between threads by
* increasing the number of chunks, guarantee at least the minimum
* chunk size from the caller, and honor the caller's alignment.
+ * Ensure chunk_size is at least 1 to prevent divide-by-0
+ * panic in padata_mt_helper().
*/
ps.chunk_size = job->size / (ps.nworks * load_balance_factor);
ps.chunk_size = max(ps.chunk_size, job->min_chunk);
+ ps.chunk_size = max(ps.chunk_size, 1ul);
ps.chunk_size = roundup(ps.chunk_size, job->align);
+ /*
+ * chunk_size can be 0 if the caller sets min_chunk to 0. So force it
+ * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().
+ */
+ if (!ps.chunk_size)
+ ps.chunk_size = 1U;
+
list_for_each_entry(pw, &works, pw_list)
if (job->numa_aware) {
int old_node = atomic_read(&last_used_nid);
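
The padata_do_serial() hunk orders entries by the sign of the sequence-number difference, which keeps the sort correct when seq_nr wraps around, whereas a plain '<' on the raw u32 values would not. A standalone demonstration of the difference (plain C, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* True if 'a' was issued before 'b', even across a 32-bit wraparound. */
static int seq_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

int main(void)
{
        uint32_t old_seq = 0xfffffffeu; /* issued just before the wrap */
        uint32_t new_seq = 2;           /* issued just after the wrap  */

        printf("direct compare : %s\n",
               old_seq < new_seq ? "old first (ok)" : "new first (wrong)");
        printf("signed diff    : %s\n",
               seq_before(old_seq, new_seq) ? "old first (ok)" : "new first (wrong)");
        return 0;
}
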
diff --git a/kernel/panic.c b/kernel/panic.c
index f861bedc1925..753d12f4dc8f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -64,6 +64,8 @@ unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
static unsigned int warn_limit __read_mostly;
+bool panic_triggering_all_cpu_backtrace;
+
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -253,8 +255,12 @@ void check_panic_on_warn(const char *origin)
*/
static void panic_other_cpus_shutdown(bool crash_kexec)
{
- if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+ if (panic_print & PANIC_PRINT_ALL_CPU_BT) {
+ /* Temporarily allow non-panic CPUs to write their backtraces. */
+ panic_triggering_all_cpu_backtrace = true;
trigger_all_cpu_backtrace();
+ panic_triggering_all_cpu_backtrace = false;
+ }
/*
* Note that smp_send_stop() is the usual SMP shutdown function,
@@ -368,6 +374,8 @@ void panic(const char *fmt, ...)
panic_other_cpus_shutdown(_crash_kexec_post_notifiers);
+ printk_legacy_allow_panic_sync();
+
/*
* Run any panic handlers, including those that might need to
* add information to the kmsg dump output.
@@ -457,6 +465,7 @@ void panic(const char *fmt, ...)
* Explicitly flush the kernel log buffer one last time.
*/
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+ nbcon_atomic_flush_unsafe();
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
@@ -676,6 +685,7 @@ bool oops_may_print(void)
*/
void oops_enter(void)
{
+ nbcon_cpu_emergency_enter();
tracing_off();
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
@@ -698,6 +708,7 @@ void oops_exit(void)
{
do_oops_enter_exit();
print_oops_end_marker();
+ nbcon_cpu_emergency_exit();
kmsg_dump(KMSG_DUMP_OOPS);
}
@@ -709,6 +720,8 @@ struct warn_args {
void __warn(const char *file, int line, void *caller, unsigned taint,
struct pt_regs *regs, struct warn_args *args)
{
+ nbcon_cpu_emergency_enter();
+
disable_trace_on_warning();
if (file)
@@ -744,6 +757,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
+
+ nbcon_cpu_emergency_exit();
}
#ifdef CONFIG_BUG
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0a213f69a9e4..e35829d36039 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1123,11 +1123,11 @@ static const char * const hibernation_modes[] = {
static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
+ ssize_t count = 0;
int i;
- char *start = buf;
if (!hibernation_available())
- return sprintf(buf, "[disabled]\n");
+ return sysfs_emit(buf, "[disabled]\n");
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (!hibernation_modes[i])
@@ -1147,12 +1147,16 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
continue;
}
if (i == hibernation_mode)
- buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
+ count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]);
else
- buf += sprintf(buf, "%s ", hibernation_modes[i]);
+ count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]);
}
- buf += sprintf(buf, "\n");
- return buf-start;
+
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
+
+ return count;
}
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1210,8 +1214,8 @@ power_attr(disk);
static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
- MINOR(swsusp_resume_device));
+ return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
+ MINOR(swsusp_resume_device));
}
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1270,7 +1274,7 @@ power_attr(resume);
static ssize_t resume_offset_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block);
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block);
}
static ssize_t resume_offset_store(struct kobject *kobj,
@@ -1293,7 +1297,7 @@ power_attr(resume_offset);
static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n", image_size);
+ return sysfs_emit(buf, "%lu\n", image_size);
}
static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1314,7 +1318,7 @@ power_attr(image_size);
static ssize_t reserved_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", reserved_size);
+ return sysfs_emit(buf, "%lu\n", reserved_size);
}
static ssize_t reserved_size_store(struct kobject *kobj,
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a9e0693aaf69..6254814d4817 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -115,7 +115,7 @@ int pm_async_enabled = 1;
static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", pm_async_enabled);
+ return sysfs_emit(buf, "%d\n", pm_async_enabled);
}
static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -139,7 +139,7 @@ power_attr(pm_async);
static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- char *s = buf;
+ ssize_t count = 0;
suspend_state_t i;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
@@ -149,17 +149,17 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
const char *label = mem_sleep_states[i];
if (mem_sleep_current == i)
- s += sprintf(s, "[%s] ", label);
+ count += sysfs_emit_at(buf, count, "[%s] ", label);
else
- s += sprintf(s, "%s ", label);
+ count += sysfs_emit_at(buf, count, "%s ", label);
}
}
/* Convert the last space to a newline if needed. */
- if (s != buf)
- *(s-1) = '\n';
+ if (count > 0)
+ buf[count - 1] = '\n';
- return (s - buf);
+ return count;
}
static suspend_state_t decode_suspend_state(const char *buf, size_t n)
@@ -220,7 +220,7 @@ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC);
static ssize_t sync_on_suspend_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", sync_on_suspend_enabled);
+ return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled);
}
static ssize_t sync_on_suspend_store(struct kobject *kobj,
@@ -257,22 +257,22 @@ static const char * const pm_tests[__TEST_AFTER_LAST] = {
static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- char *s = buf;
+ ssize_t count = 0;
int level;
for (level = TEST_FIRST; level <= TEST_MAX; level++)
if (pm_tests[level]) {
if (level == pm_test_level)
- s += sprintf(s, "[%s] ", pm_tests[level]);
+ count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]);
else
- s += sprintf(s, "%s ", pm_tests[level]);
+ count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]);
}
- if (s != buf)
- /* convert the last space to a newline */
- *(s-1) = '\n';
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
- return (s - buf);
+ return count;
}
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -390,7 +390,7 @@ static const char * const suspend_step_names[] = {
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
{ \
- return sprintf(buf, format_str, suspend_stats._name); \
+ return sysfs_emit(buf, format_str, suspend_stats._name);\
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
@@ -404,7 +404,7 @@ suspend_attr(max_hw_sleep, "%llu\n");
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
{ \
- return sprintf(buf, "%u\n", \
+ return sysfs_emit(buf, "%u\n", \
suspend_stats.step_failures[step-1]); \
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
@@ -428,7 +428,7 @@ static ssize_t last_failed_dev_show(struct kobject *kobj,
index %= REC_FAILED_NUM;
last_failed_dev = suspend_stats.failed_devs[index];
- return sprintf(buf, "%s\n", last_failed_dev);
+ return sysfs_emit(buf, "%s\n", last_failed_dev);
}
static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev);
@@ -442,7 +442,7 @@ static ssize_t last_failed_errno_show(struct kobject *kobj,
index %= REC_FAILED_NUM;
last_failed_errno = suspend_stats.errno[index];
- return sprintf(buf, "%d\n", last_failed_errno);
+ return sysfs_emit(buf, "%d\n", last_failed_errno);
}
static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno);
@@ -456,7 +456,7 @@ static ssize_t last_failed_step_show(struct kobject *kobj,
index %= REC_FAILED_NUM;
step = suspend_stats.failed_steps[index];
- return sprintf(buf, "%s\n", suspend_step_names[step]);
+ return sysfs_emit(buf, "%s\n", suspend_step_names[step]);
}
static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step);
@@ -571,7 +571,7 @@ bool pm_print_times_enabled;
static ssize_t pm_print_times_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", pm_print_times_enabled);
+ return sysfs_emit(buf, "%d\n", pm_print_times_enabled);
}
static ssize_t pm_print_times_store(struct kobject *kobj,
@@ -604,7 +604,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
if (!pm_wakeup_irq())
return -ENODATA;
- return sprintf(buf, "%u\n", pm_wakeup_irq());
+ return sysfs_emit(buf, "%u\n", pm_wakeup_irq());
}
power_attr_ro(pm_wakeup_irq);
@@ -620,7 +620,7 @@ EXPORT_SYMBOL_GPL(pm_debug_messages_should_print);
static ssize_t pm_debug_messages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", pm_debug_messages_on);
+ return sysfs_emit(buf, "%d\n", pm_debug_messages_on);
}
static ssize_t pm_debug_messages_store(struct kobject *kobj,
@@ -668,21 +668,23 @@ struct kobject *power_kobj;
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- char *s = buf;
+ ssize_t count = 0;
#ifdef CONFIG_SUSPEND
suspend_state_t i;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (pm_states[i])
- s += sprintf(s,"%s ", pm_states[i]);
+ count += sysfs_emit_at(buf, count, "%s ", pm_states[i]);
#endif
if (hibernation_available())
- s += sprintf(s, "disk ");
- if (s != buf)
- /* convert the last space to a newline */
- *(s-1) = '\n';
- return (s - buf);
+ count += sysfs_emit_at(buf, count, "disk ");
+
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
+
+ return count;
}
static suspend_state_t decode_state(const char *buf, size_t n)
@@ -782,7 +784,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
unsigned int val;
return pm_get_wakeup_count(&val, true) ?
- sprintf(buf, "%u\n", val) : -EINTR;
+ sysfs_emit(buf, "%u\n", val) : -EINTR;
}
static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -824,17 +826,17 @@ static ssize_t autosleep_show(struct kobject *kobj,
suspend_state_t state = pm_autosleep_state();
if (state == PM_SUSPEND_ON)
- return sprintf(buf, "off\n");
+ return sysfs_emit(buf, "off\n");
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", pm_states[state] ?
+ return sysfs_emit(buf, "%s\n", pm_states[state] ?
pm_states[state] : "error");
#endif
#ifdef CONFIG_HIBERNATION
- return sprintf(buf, "disk\n");
+ return sysfs_emit(buf, "disk\n");
#else
- return sprintf(buf, "error");
+ return sysfs_emit(buf, "error\n");
#endif
}
@@ -903,7 +905,7 @@ int pm_trace_enabled;
static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", pm_trace_enabled);
+ return sysfs_emit(buf, "%d\n", pm_trace_enabled);
}
static ssize_t
@@ -940,7 +942,7 @@ power_attr_ro(pm_trace_dev_match);
static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", freeze_timeout_msecs);
+ return sysfs_emit(buf, "%u\n", freeze_timeout_msecs);
}
static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 405eddbda4fc..30894d8f0a78 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1365,11 +1365,6 @@ static unsigned int count_highmem_pages(void)
}
return n;
}
-#else
-static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
-{
- return NULL;
-}
#endif /* CONFIG_HIGHMEM */
/**
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 19dcc5832651..3fcb48502adb 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -2,11 +2,12 @@
/*
* internal.h - printk internal definitions
*/
-#include <linux/percpu.h>
#include <linux/console.h>
-#include "printk_ringbuffer.h"
+#include <linux/percpu.h>
+#include <linux/types.h>
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
+struct ctl_table;
void __init printk_sysctl_init(void);
int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
@@ -20,6 +21,19 @@ int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
(con->flags & CON_BOOT) ? "boot" : "", \
con->name, con->index, ##__VA_ARGS__)
+/*
+ * Identify if legacy printing is forced in a dedicated kthread. If
+ * true, all printing via console lock occurs within a dedicated
+ * legacy printer thread. The only exception is on panic, after the
+ * nbcon consoles have had their chance to print the panic messages
+ * first.
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define force_legacy_kthread() (true)
+#else
+# define force_legacy_kthread() (false)
+#endif
+
#ifdef CONFIG_PRINTK
#ifdef CONFIG_PRINTK_CALLER
@@ -43,7 +57,11 @@ enum printk_info_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
+struct printk_ringbuffer;
+struct dev_printk_info;
+
extern struct printk_ringbuffer *prb;
+extern bool printk_kthreads_running;
__printf(4, 0)
int vprintk_store(int facility, int level,
@@ -53,6 +71,9 @@ int vprintk_store(int facility, int level,
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
+void __printk_safe_enter(void);
+void __printk_safe_exit(void);
+
bool printk_percpu_data_ready(void);
#define printk_safe_enter_irqsave(flags) \
@@ -68,15 +89,85 @@ bool printk_percpu_data_ready(void);
} while (0)
void defer_console_output(void);
+bool is_printk_legacy_deferred(void);
u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags);
+void console_lock_spinning_enable(void);
+int console_lock_spinning_disable_and_check(int cookie);
u64 nbcon_seq_read(struct console *con);
void nbcon_seq_force(struct console *con, u64 seq);
bool nbcon_alloc(struct console *con);
-void nbcon_init(struct console *con);
void nbcon_free(struct console *con);
+enum nbcon_prio nbcon_get_default_prio(void);
+void nbcon_atomic_flush_pending(void);
+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic);
+bool nbcon_kthread_create(struct console *con);
+void nbcon_kthread_stop(struct console *con);
+void nbcon_kthreads_wake(void);
+
+/*
+ * Check if the given console is currently capable and allowed to print
+ * records. Note that this function does not consider the current context,
+ * which can also play a role in deciding if @con can be used to print
+ * records.
+ */
+static inline bool console_is_usable(struct console *con, short flags, bool use_atomic)
+{
+ if (!(flags & CON_ENABLED))
+ return false;
+
+ if ((flags & CON_SUSPENDED))
+ return false;
+
+ if (flags & CON_NBCON) {
+ /* The write_atomic() callback is optional. */
+ if (use_atomic && !con->write_atomic)
+ return false;
+
+ /*
+ * For the !use_atomic case, @printk_kthreads_running is not
+ * checked because the write_thread() callback is also used
+ * via the legacy loop when the printer threads are not
+ * available.
+ */
+ } else {
+ if (!con->write)
+ return false;
+ }
+
+ /*
+ * Console drivers may assume that per-cpu resources have been
+ * allocated. So unless they're explicitly marked as being able to
+ * cope (CON_ANYTIME) don't call them until this CPU is officially up.
+ */
+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
+ return false;
+
+ return true;
+}
+
+/**
+ * nbcon_kthread_wake - Wake up a console printing thread
+ * @con: Console to operate on
+ */
+static inline void nbcon_kthread_wake(struct console *con)
+{
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the rcuwait is empty.
+ *
+ * The full memory barrier in rcuwait_wake_up() pairs with the full
+ * memory barrier within set_current_state() of
+ * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait()
+ * adds the waiter but before it has checked the wait condition.
+ *
+ * This pairs with nbcon_kthread_func:A.
+ */
+ rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */
+}
#else
@@ -84,6 +175,8 @@ void nbcon_free(struct console *con);
#define PRINTK_MESSAGE_MAX 0
#define PRINTKRB_RECORD_MAX 0
+#define printk_kthreads_running (false)
+
/*
* In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
@@ -93,14 +186,119 @@ void nbcon_free(struct console *con);
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
static inline bool printk_percpu_data_ready(void) { return false; }
+static inline void defer_console_output(void) { }
+static inline bool is_printk_legacy_deferred(void) { return false; }
static inline u64 nbcon_seq_read(struct console *con) { return 0; }
static inline void nbcon_seq_force(struct console *con, u64 seq) { }
static inline bool nbcon_alloc(struct console *con) { return false; }
-static inline void nbcon_init(struct console *con) { }
static inline void nbcon_free(struct console *con) { }
+static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; }
+static inline void nbcon_atomic_flush_pending(void) { }
+static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic) { return false; }
+static inline void nbcon_kthread_wake(struct console *con) { }
+static inline void nbcon_kthreads_wake(void) { }
+
+static inline bool console_is_usable(struct console *con, short flags,
+ bool use_atomic) { return false; }
#endif /* CONFIG_PRINTK */
+extern bool have_boot_console;
+extern bool have_nbcon_console;
+extern bool have_legacy_console;
+extern bool legacy_allow_panic_sync;
+
+/**
+ * struct console_flush_type - Define available console flush methods
+ * @nbcon_atomic: Flush directly using nbcon_atomic() callback
+ * @nbcon_offload: Offload flush to printer thread
+ * @legacy_direct: Call the legacy loop in this context
+ * @legacy_offload: Offload the legacy loop into IRQ or legacy thread
+ *
+ * Note that the legacy loop also flushes the nbcon consoles.
+ */
+struct console_flush_type {
+ bool nbcon_atomic;
+ bool nbcon_offload;
+ bool legacy_direct;
+ bool legacy_offload;
+};
+
+/*
+ * Identify which console flushing methods should be used in the context of
+ * the caller.
+ */
+static inline void printk_get_console_flush_type(struct console_flush_type *ft)
+{
+ memset(ft, 0, sizeof(*ft));
+
+ switch (nbcon_get_default_prio()) {
+ case NBCON_PRIO_NORMAL:
+ if (have_nbcon_console && !have_boot_console) {
+ if (printk_kthreads_running)
+ ft->nbcon_offload = true;
+ else
+ ft->nbcon_atomic = true;
+ }
+
+ /* Legacy consoles are flushed directly when possible. */
+ if (have_legacy_console || have_boot_console) {
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+ else
+ ft->legacy_offload = true;
+ }
+ break;
+
+ case NBCON_PRIO_EMERGENCY:
+ if (have_nbcon_console && !have_boot_console)
+ ft->nbcon_atomic = true;
+
+ /* Legacy consoles are flushed directly when possible. */
+ if (have_legacy_console || have_boot_console) {
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+ else
+ ft->legacy_offload = true;
+ }
+ break;
+
+ case NBCON_PRIO_PANIC:
+ /*
+ * In panic, the nbcon consoles will directly print. But
+ * only allowed if there are no boot consoles.
+ */
+ if (have_nbcon_console && !have_boot_console)
+ ft->nbcon_atomic = true;
+
+ if (have_legacy_console || have_boot_console) {
+ /*
+ * This is the same decision as NBCON_PRIO_NORMAL
+ * except that offloading never occurs in panic.
+ *
+ * Note that console_flush_on_panic() will flush
+ * legacy consoles anyway, even if unsafe.
+ */
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+
+ /*
+ * In panic, if nbcon atomic printing occurs,
+ * the legacy consoles must remain silent until
+ * explicitly allowed.
+ */
+ if (ft->nbcon_atomic && !legacy_allow_panic_sync)
+ ft->legacy_direct = false;
+ }
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
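
A hedged sketch of how a caller typically acts on the computed flush type, mirroring the vprintk_emit() and console_cpu_notify() changes later in this diff; the function name is illustrative only:

static void example_flush_pending(void)
{
	struct console_flush_type ft;

	printk_get_console_flush_type(&ft);

	if (ft.nbcon_atomic)
		nbcon_atomic_flush_pending();
	if (ft.nbcon_offload)
		nbcon_kthreads_wake();
	if (ft.legacy_direct) {
		if (console_trylock())
			console_unlock();
	}
	if (ft.legacy_offload)
		defer_console_output();
}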
extern struct printk_buffers printk_shared_pbufs;
/**
@@ -135,4 +333,5 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
#ifdef CONFIG_PRINTK
void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
+void console_prepend_replay(struct printk_message *pmsg);
#endif
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index c8093bcc01fe..fd12efcc4aed 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -2,11 +2,25 @@
// Copyright (C) 2022 Linutronix GmbH, John Ogness
// Copyright (C) 2022 Intel, Thomas Gleixner
-#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
#include <linux/console.h>
#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kthread.h>
+#include <linux/minmax.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
#include "internal.h"
+#include "printk_ringbuffer.h"
/*
 * Printk console printing implementation for consoles that do not depend
* on the legacy style console_lock mechanism.
@@ -172,9 +186,6 @@ void nbcon_seq_force(struct console *con, u64 seq)
u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq));
-
- /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */
- con->seq = 0;
}
/**
@@ -231,6 +242,13 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
struct nbcon_state new;
do {
+ /*
+ * Panic does not imply that the console is owned. However, it
+ * is critical that non-panic CPUs during panic are unable to
+ * acquire ownership in order to satisfy the assumptions of
+ * nbcon_waiter_matches(). In particular, the assumption that
+ * lower priorities are ignored during panic.
+ */
if (other_cpu_in_panic())
return -EPERM;
@@ -262,18 +280,29 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
/*
* The request context is well defined by the @req_prio because:
*
- * - Only a context with a higher priority can take over the request.
+ * - Only a context with a priority higher than the owner can become
+ * a waiter.
+ * - Only a context with a priority higher than the waiter can
+ * directly take over the request.
* - There are only three priorities.
* - Only one CPU is allowed to request PANIC priority.
* - Lower priorities are ignored during panic() until reboot.
*
* As a result, the following scenario is *not* possible:
*
- * 1. Another context with a higher priority directly takes ownership.
- * 2. The higher priority context releases the ownership.
- * 3. A lower priority context takes the ownership.
- * 4. Another context with the same priority as this context
+ * 1. This context is currently a waiter.
+ * 2. Another context with a higher priority than this context
+ * directly takes ownership.
+ * 3. The higher priority context releases the ownership.
+ * 4. Another lower priority context takes the ownership.
+ * 5. Another context with the same priority as this context
* creates a request and starts waiting.
+ *
+ * Event #1 implies this context is EMERGENCY.
+ * Event #2 implies the new context is PANIC.
+ * Event #3 occurs when panic() has flushed the console.
+ * Events #4 and #5 are not possible due to the other_cpu_in_panic()
+ * check in nbcon_context_try_acquire_direct().
*/
return (cur->req_prio == expected_prio);
@@ -531,6 +560,7 @@ static struct printk_buffers panic_nbcon_pbufs;
* nbcon_context_try_acquire - Try to acquire nbcon console
* @ctxt: The context of the caller
*
+ * Context: Under @ctxt->con->device_lock() or local_irq_save().
* Return: True if the console was acquired. False otherwise.
*
* If the caller allowed an unsafe hostile takeover, on success the
@@ -538,7 +568,6 @@ static struct printk_buffers panic_nbcon_pbufs;
* in an unsafe state. Otherwise, on success the caller may assume
* the console is not in an unsafe state.
*/
-__maybe_unused
static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
{
unsigned int cpu = smp_processor_id();
@@ -581,11 +610,29 @@ static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
int expected_prio)
{
/*
- * Since consoles can only be acquired by higher priorities,
- * owning contexts are uniquely identified by @prio. However,
- * since contexts can unexpectedly lose ownership, it is
- * possible that later another owner appears with the same
- * priority. For this reason @cpu is also needed.
+ * A similar function, nbcon_waiter_matches(), only deals with
+ * EMERGENCY and PANIC priorities. However, this function must also
+ * deal with the NORMAL priority, which requires additional checks
+ * and constraints.
+ *
+ * For the case where preemption and interrupts are disabled, it is
+ * enough to also verify that the owning CPU has not changed.
+ *
+ * For the case where preemption or interrupts are enabled, an
+ * external synchronization method *must* be used. In particular,
+ * the driver-specific locking mechanism used in device_lock()
+ * (including disabling migration) should be used. It prevents
+ * scenarios such as:
+ *
+ * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and
+ * is scheduled out.
+ * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY
+ * and releases it.
+ * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X]
+ * and is scheduled out.
+ * 4. [Task A] resumes running on [CPU X] and sees that the console is
+ * still owned by a task on [CPU X] with NBCON_PRIO_NORMAL. Thus
+ * [Task A] thinks it is the owner when it is not.
*/
if (cur->prio != expected_prio)
@@ -784,6 +831,19 @@ out:
return nbcon_context_can_proceed(ctxt, &cur);
}
+static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
+ char *buf, unsigned int len)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+
+ wctxt->outbuf = buf;
+ wctxt->len = len;
+ nbcon_state_read(con, &cur);
+ wctxt->unsafe_takeover = cur.unsafe_takeover;
+}
+
/**
* nbcon_enter_unsafe - Enter an unsafe region in the driver
* @wctxt: The write context that was handed to the write function
@@ -799,8 +859,12 @@ out:
bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
{
struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ bool is_owner;
- return nbcon_context_enter_unsafe(ctxt);
+ is_owner = nbcon_context_enter_unsafe(ctxt);
+ if (!is_owner)
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+ return is_owner;
}
EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
@@ -819,14 +883,47 @@ EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
{
struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ bool ret;
- return nbcon_context_exit_unsafe(ctxt);
+ ret = nbcon_context_exit_unsafe(ctxt);
+ if (!ret)
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+ return ret;
}
EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
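
A hedged sketch of how a driver write callback would bracket hardware access with the unsafe markers; the driver-specific details are assumptions for illustration, not taken from this patch:

static void example_write_atomic(struct console *con,
				 struct nbcon_write_context *wctxt)
{
	if (!nbcon_enter_unsafe(wctxt))
		return;	/* ownership lost; wctxt->outbuf was cleared */

	/* ... program the hardware and emit wctxt->outbuf (wctxt->len bytes) ... */

	nbcon_exit_unsafe(wctxt);
}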
/**
+ * nbcon_reacquire_nobuf - Reacquire a console after losing ownership
+ * while printing
+ * @wctxt: The write context that was handed to the write callback
+ *
+ * Since ownership can be lost at any time due to handover or takeover, a
+ * printing context _must_ be prepared to back out immediately and
+ * carefully. However, there are scenarios where the printing context must
+ * reacquire ownership in order to finalize or revert hardware changes.
+ *
+ * This function allows a printing context to reacquire ownership using the
+ * same priority as its previous ownership.
+ *
+ * Note that after a successful reacquire the printing context will have no
+ * output buffer because that has been lost. This function cannot be used to
+ * resume printing.
+ */
+void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ while (!nbcon_context_try_acquire(ctxt))
+ cpu_relax();
+
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf);
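+
A hedged sketch of the intended reacquire pattern inside a write callback; the hardware-restore step is an assumed example:

static void example_write_with_reacquire(struct console *con,
					 struct nbcon_write_context *wctxt)
{
	if (!nbcon_enter_unsafe(wctxt))
		return;

	/* ... put the hardware into a modified state and start emitting ... */

	if (!nbcon_exit_unsafe(wctxt)) {
		/*
		 * Ownership was lost mid-write while the hardware is in a
		 * modified state. Reacquire (without an output buffer) only
		 * to revert that state, then return; the core releases the
		 * context when it sees wctxt->outbuf == NULL.
		 */
		nbcon_reacquire_nobuf(wctxt);
		/* ... restore interrupt masks / FIFO settings ... */
	}
}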
+
+/**
* nbcon_emit_next_record - Emit a record in the acquired context
* @wctxt: The write context that will be handed to the write function
+ * @use_atomic: True if the write_atomic() callback is to be used
*
* Return: True if this context still owns the console. False if
* ownership was handed over or taken.
@@ -840,8 +937,7 @@ EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
* When true is returned, @wctxt->ctxt.backlog indicates whether there are
 * still records pending in the ringbuffer.
*/
-__maybe_unused
-static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic)
{
struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
struct console *con = ctxt->console;
@@ -852,7 +948,22 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
unsigned long con_dropped;
struct nbcon_state cur;
unsigned long dropped;
- bool done;
+ unsigned long ulseq;
+
+ /*
+ * This function should never be called for consoles that have not
+ * implemented the necessary callback for writing: i.e. legacy
+ * consoles and, when atomic, nbcon consoles with no write_atomic().
+ * Handle it as if ownership was lost and try to continue.
+ *
+ * Note that for nbcon consoles the write_thread() callback is
+ * mandatory and was already checked in nbcon_alloc().
+ */
+ if (WARN_ON_ONCE((use_atomic && !con->write_atomic) ||
+ !(console_srcu_read_flags(con) & CON_NBCON))) {
+ nbcon_context_release(ctxt);
+ return false;
+ }
/*
* The printk buffers are filled within an unsafe section. This
@@ -878,6 +989,29 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
if (dropped && !is_extended)
console_prepend_dropped(&pmsg, dropped);
+ /*
+ * If the previous owner was assigned the same record, this context
+ * has taken over ownership and is replaying the record. Prepend a
+ * message to let the user know the record is replayed.
+ */
+ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq));
+ if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) {
+ console_prepend_replay(&pmsg);
+ } else {
+ /*
+ * Ensure this context is still the owner before trying to
+ * update @nbcon_prev_seq. Otherwise the value in @ulseq may
+ * not be from the previous owner and instead be some later
+ * value from the context that took over ownership.
+ */
+ nbcon_state_read(con, &cur);
+ if (!nbcon_context_can_proceed(ctxt, &cur))
+ return false;
+
+ atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq,
+ __u64seq_to_ulseq(pmsg.seq));
+ }
+
if (!nbcon_context_exit_unsafe(ctxt))
return false;
@@ -886,22 +1020,27 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
goto update_con;
/* Initialize the write context for driver callbacks. */
- wctxt->outbuf = &pmsg.pbufs->outbuf[0];
- wctxt->len = pmsg.outbuf_len;
- nbcon_state_read(con, &cur);
- wctxt->unsafe_takeover = cur.unsafe_takeover;
+ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);
- if (con->write_atomic) {
- done = con->write_atomic(con, wctxt);
- } else {
+ if (use_atomic)
+ con->write_atomic(con, wctxt);
+ else
+ con->write_thread(con, wctxt);
+
+ if (!wctxt->outbuf) {
+ /*
+ * Ownership was lost and reacquired by the driver. Handle it
+ * as if ownership was lost.
+ */
nbcon_context_release(ctxt);
- WARN_ON_ONCE(1);
- done = false;
+ return false;
}
- /* If not done, the emit was aborted. */
- if (!done)
- return false;
+ /*
+ * Ownership may have been lost but _not_ reacquired by the driver.
+ * This case is detected and handled when entering unsafe to update
+ * dropped/seq values.
+ */
/*
* Since any dropped message was successfully output, reset the
@@ -928,54 +1067,650 @@ update_con:
return nbcon_context_exit_unsafe(ctxt);
}
+/*
+ * nbcon_emit_one - Print one record for an nbcon console using the
+ * specified callback
+ * @wctxt: An initialized write context struct to use for this context
+ * @use_atomic: True if the write_atomic() callback is to be used
+ *
+ * Return: True, when a record has been printed and there are still
+ * pending records. The caller might want to continue flushing.
+ *
+ * False, when there is no pending record, or when the console
+ * context cannot be acquired, or the ownership has been lost.
+ * The caller should give up. Either the job is done, cannot be
+ * done, or will be handled by the owning context.
+ *
+ * This is an internal helper to handle the locking of the console before
+ * calling nbcon_emit_next_record().
+ */
+static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ unsigned long flags;
+ bool ret = false;
+
+ if (!use_atomic) {
+ con->device_lock(con, &flags);
+
+ /*
+ * Ensure this stays on the CPU to make handover and
+ * takeover possible.
+ */
+ cant_migrate();
+ }
+
+ if (!nbcon_context_try_acquire(ctxt))
+ goto out;
+
+ /*
+ * nbcon_emit_next_record() returns false when the console was
+ * handed over or taken over. In both cases the context is no
+ * longer valid.
+ *
+ * The higher priority printing context takes over responsibility
+ * to print the pending records.
+ */
+ if (!nbcon_emit_next_record(wctxt, use_atomic))
+ goto out;
+
+ nbcon_context_release(ctxt);
+
+ ret = ctxt->backlog;
+out:
+ if (!use_atomic)
+ con->device_unlock(con, flags);
+ return ret;
+}
+
/**
- * nbcon_alloc - Allocate buffers needed by the nbcon console
- * @con: Console to allocate buffers for
+ * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup
+ * @con: Console to operate on
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
*
- * Return: True on success. False otherwise and the console cannot
- * be used.
+ * Return: True if the thread should shut down or if the console is
+ * allowed to print and a record is available. False otherwise.
*
- * This is not part of nbcon_init() because buffer allocation must
- * be performed earlier in the console registration process.
+ * After the thread wakes up, it must first check if it should shut down before
+ * attempting any printing.
*/
-bool nbcon_alloc(struct console *con)
+static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt)
{
- if (con->flags & CON_BOOT) {
+ bool ret = false;
+ short flags;
+ int cookie;
+
+ if (kthread_should_stop())
+ return true;
+
+ cookie = console_srcu_read_lock();
+
+ flags = console_srcu_read_flags(con);
+ if (console_is_usable(con, flags, false)) {
+ /* Bring the sequence in @ctxt up to date */
+ ctxt->seq = nbcon_seq_read(con);
+
+ ret = prb_read_valid(prb, ctxt->seq, NULL);
+ }
+
+ console_srcu_read_unlock(cookie);
+ return ret;
+}
+
+/**
+ * nbcon_kthread_func - The printer thread function
+ * @__console: Console to operate on
+ *
+ * Return: 0
+ */
+static int nbcon_kthread_func(void *__console)
+{
+ struct console *con = __console;
+ struct nbcon_write_context wctxt = {
+ .ctxt.console = con,
+ .ctxt.prio = NBCON_PRIO_NORMAL,
+ };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ short con_flags;
+ bool backlog;
+ int cookie;
+
+wait_for_event:
+ /*
+ * Guarantee this task is visible on the rcuwait before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * ___rcuwait_wait_event() pairs with the full memory
+ * barrier within rcuwait_has_sleeper().
+ *
+ * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A.
+ */
+ rcuwait_wait_event(&con->rcuwait,
+ nbcon_kthread_should_wakeup(con, ctxt),
+ TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */
+
+ do {
+ if (kthread_should_stop())
+ return 0;
+
+ backlog = false;
+
/*
- * Boot console printing is synchronized with legacy console
- * printing, so boot consoles can share the same global printk
- * buffers.
+ * Keep the srcu read lock around the entire operation so that
+ * synchronize_srcu() can guarantee that the kthread stopped
+ * or suspended printing.
*/
- con->pbufs = &printk_shared_pbufs;
+ cookie = console_srcu_read_lock();
+
+ con_flags = console_srcu_read_flags(con);
+
+ if (console_is_usable(con, con_flags, false))
+ backlog = nbcon_emit_one(&wctxt, false);
+
+ console_srcu_read_unlock(cookie);
+
+ cond_resched();
+
+ } while (backlog);
+
+ goto wait_for_event;
+}
+
+/**
+ * nbcon_irq_work - irq work to wake console printer thread
+ * @irq_work: The irq work to operate on
+ */
+static void nbcon_irq_work(struct irq_work *irq_work)
+{
+ struct console *con = container_of(irq_work, struct console, irq_work);
+
+ nbcon_kthread_wake(con);
+}
+
+static inline bool rcuwait_has_sleeper(struct rcuwait *w)
+{
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the rcuwait is empty.
+ *
+ * This full memory barrier pairs with the full memory barrier within
+ * set_current_state() of ___rcuwait_wait_event(), which is called
+ * after prepare_to_rcuwait() adds the waiter but before it has
+ * checked the wait condition.
+ *
+ * This pairs with nbcon_kthread_func:A.
+ */
+ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */
+ return rcuwait_active(w);
+}
+
+/**
+ * nbcon_kthreads_wake - Wake up printing threads using irq_work
+ */
+void nbcon_kthreads_wake(void)
+{
+ struct console *con;
+ int cookie;
+
+ if (!printk_kthreads_running)
+ return;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ if (!(console_srcu_read_flags(con) & CON_NBCON))
+ continue;
+
+ /*
+ * Only schedule irq_work if the printing thread is
+ * actively waiting. If not waiting, the thread will
+ * notice by itself that it has work to do.
+ */
+ if (rcuwait_has_sleeper(&con->rcuwait))
+ irq_work_queue(&con->irq_work);
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/*
+ * nbcon_kthread_stop - Stop a console printer thread
+ * @con: Console to operate on
+ */
+void nbcon_kthread_stop(struct console *con)
+{
+ lockdep_assert_console_list_lock_held();
+
+ if (!con->kthread)
+ return;
+
+ kthread_stop(con->kthread);
+ con->kthread = NULL;
+}
+
+/**
+ * nbcon_kthread_create - Create a console printer thread
+ * @con: Console to operate on
+ *
+ * Return: True if the kthread was started or already exists.
+ * Otherwise false and @con must not be registered.
+ *
+ * This function is called when it is expected that nbcon consoles are
+ * flushed using the kthread. Messages printed with NBCON_PRIO_NORMAL
+ * will no longer be flushed by the legacy loop. This is why failure must
+ * be fatal for console registration.
+ *
+ * If @con was already registered and this function fails, @con must be
+ * unregistered before the global state variable @printk_kthreads_running
+ * can be set.
+ */
+bool nbcon_kthread_create(struct console *con)
+{
+ struct task_struct *kt;
+
+ lockdep_assert_console_list_lock_held();
+
+ if (con->kthread)
+ return true;
+
+ kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index);
+ if (WARN_ON(IS_ERR(kt))) {
+ con_printk(KERN_ERR, con, "failed to start printing thread\n");
+ return false;
+ }
+
+ con->kthread = kt;
+
+ /*
+ * It is important that console printing threads are scheduled
+ * shortly after a printk call and with generous runtime budgets.
+ */
+ sched_set_normal(con->kthread, -20);
+
+ return true;
+}
+
+/* Track the nbcon emergency nesting per CPU. */
+static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting);
+static unsigned int early_nbcon_pcpu_emergency_nesting __initdata;
+
+/**
+ * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer
+ *
+ * Context: For reading, any context. For writing, any context which could
+ * not be migrated to another CPU.
+ * Return: Either a pointer to the per CPU emergency nesting counter of
+ * the current CPU or to the init data during early boot.
+ *
+ * The function is safe for reading per-CPU variables in any context because
+ * preemption is disabled if the current CPU is in the emergency state. See
+ * also nbcon_cpu_emergency_enter().
+ */
+static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void)
+{
+ /*
+ * The value of __printk_percpu_data_ready gets set in normal
+ * context and before SMP initialization. As a result it could
+ * never change while inside an nbcon emergency section.
+ */
+ if (!printk_percpu_data_ready())
+ return &early_nbcon_pcpu_emergency_nesting;
+
+ return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting);
+}
+
+/**
+ * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon
+ * printing on the current CPU
+ *
+ * Context: Any context.
+ * Return: The nbcon_prio to use for acquiring an nbcon console in this
+ * context for printing.
+ *
+ * The function is safe for reading per-CPU data in any context because
+ * preemption is disabled if the current CPU is in the emergency or panic
+ * state.
+ */
+enum nbcon_prio nbcon_get_default_prio(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ if (this_cpu_in_panic())
+ return NBCON_PRIO_PANIC;
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+ if (*cpu_emergency_nesting)
+ return NBCON_PRIO_EMERGENCY;
+
+ return NBCON_PRIO_NORMAL;
+}
+
+/**
+ * nbcon_legacy_emit_next_record - Print one record for an nbcon console
+ * in legacy contexts
+ * @con: The console to print on
+ * @handover: Will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding
+ * both the console_lock and the SRCU read lock. Otherwise it
+ * is set to false.
+ * @cookie: The cookie from the SRCU read lock.
+ * @use_atomic: Set true when called in an atomic or unknown context.
+ * It affects which nbcon callback will be used: write_atomic()
+ * or write_thread().
+ *
+ * When false, the write_thread() callback is used and would be
+ * called in a preemptible context unless disabled by the
+ * device_lock. The legacy handover is not allowed in this mode.
+ *
+ * Context: Any context except NMI.
+ * Return: True, when a record has been printed and there are still
+ * pending records. The caller might want to continue flushing.
+ *
+ * False, when there is no pending record, or when the console
+ * context cannot be acquired, or the ownership has been lost.
+ * The caller should give up. Either the job is done, cannot be
+ * done, or will be handled by the owning context.
+ *
+ * This function is meant to be called by console_flush_all() to print records
+ * on nbcon consoles from legacy context (printing via console unlocking).
+ * Essentially it is the nbcon version of console_emit_next_record().
+ */
+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic)
+{
+ struct nbcon_write_context wctxt = { };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ unsigned long flags;
+ bool progress;
+
+ ctxt->console = con;
+ ctxt->prio = nbcon_get_default_prio();
+
+ if (use_atomic) {
+ /*
+ * In an atomic or unknown context, use the same procedure as
+ * in console_emit_next_record(). It allows handovers.
+ */
+ printk_safe_enter_irqsave(flags);
+ console_lock_spinning_enable();
+ stop_critical_timings();
+ }
+
+ progress = nbcon_emit_one(&wctxt, use_atomic);
+
+ if (use_atomic) {
+ start_critical_timings();
+ *handover = console_lock_spinning_disable_and_check(cookie);
+ printk_safe_exit_irqrestore(flags);
} else {
- con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
- if (!con->pbufs) {
- con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
- return false;
+ /* Non-atomic does not perform legacy spinning handovers. */
+ *handover = false;
+ }
+
+ return progress;
+}
+
+/**
+ * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
+ * write_atomic() callback
+ * @con: The nbcon console to flush
+ * @stop_seq: Flush up until this record
+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
+ *
+ * Return: 0 if @con was flushed up to @stop_seq. Otherwise, an error code on
+ * failure.
+ *
+ * Errors:
+ *
+ * -EPERM: Unable to acquire console ownership.
+ *
+ * -EAGAIN: Another context took over ownership while printing.
+ *
+ * -ENOENT: A record before @stop_seq is not available.
+ *
+ * If flushing up to @stop_seq was not successful, it only makes sense for the
+ * caller to try again when -EAGAIN was returned. When -EPERM is returned,
+ * this context is not allowed to acquire the console. When -ENOENT is
+ * returned, it cannot be expected that the unfinalized record will become
+ * available.
+ */
+static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
+ bool allow_unsafe_takeover)
+{
+ struct nbcon_write_context wctxt = { };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ int err = 0;
+
+ ctxt->console = con;
+ ctxt->spinwait_max_us = 2000;
+ ctxt->prio = nbcon_get_default_prio();
+ ctxt->allow_unsafe_takeover = allow_unsafe_takeover;
+
+ if (!nbcon_context_try_acquire(ctxt))
+ return -EPERM;
+
+ while (nbcon_seq_read(con) < stop_seq) {
+ /*
+ * nbcon_emit_next_record() returns false when the console was
+ * handed over or taken over. In both cases the context is no
+ * longer valid.
+ */
+ if (!nbcon_emit_next_record(&wctxt, true))
+ return -EAGAIN;
+
+ if (!ctxt->backlog) {
+ /* Are there reserved but not yet finalized records? */
+ if (nbcon_seq_read(con) < stop_seq)
+ err = -ENOENT;
+ break;
}
}
- return true;
+ nbcon_context_release(ctxt);
+ return err;
+}
+
+/**
+ * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
+ * write_atomic() callback
+ * @con: The nbcon console to flush
+ * @stop_seq: Flush up until this record
+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
+ *
+ * This will stop flushing before @stop_seq if another context has ownership.
+ * That context is then responsible for the flushing. Likewise, if new records
+ * are added while this context was flushing and there is no other context
+ * to handle the printing, this context must also flush those records.
+ */
+static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
+ bool allow_unsafe_takeover)
+{
+ struct console_flush_type ft;
+ unsigned long flags;
+ int err;
+
+again:
+ /*
+ * Atomic flushing does not use console driver synchronization (i.e.
+ * it does not hold the port lock for uart consoles). Therefore IRQs
+ * must be disabled to avoid being interrupted and then calling into
+ * a driver that will deadlock trying to acquire console ownership.
+ */
+ local_irq_save(flags);
+
+ err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
+
+ local_irq_restore(flags);
+
+ /*
+ * If there was a new owner (-EPERM, -EAGAIN), that context is
+ * responsible for completing.
+ *
+ * Do not wait for records not yet finalized (-ENOENT) to avoid a
+ * possible deadlock. They will either get flushed by the writer or
+ * eventually skipped on panic CPU.
+ */
+ if (err)
+ return;
+
+ /*
+ * If flushing was successful but more records are available, this
+ * context must flush those remaining records if the printer thread
+ * is not available to do it.
+ */
+ printk_get_console_flush_type(&ft);
+ if (!ft.nbcon_offload &&
+ prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
+ stop_seq = prb_next_reserve_seq(prb);
+ goto again;
+ }
+}
+
+/**
+ * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their
+ * write_atomic() callback
+ * @stop_seq: Flush up until this record
+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
+ */
+static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover)
+{
+ struct console *con;
+ int cookie;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+
+ if (!(flags & CON_NBCON))
+ continue;
+
+ if (!console_is_usable(con, flags, true))
+ continue;
+
+ if (nbcon_seq_read(con) >= stop_seq)
+ continue;
+
+ nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/**
+ * nbcon_atomic_flush_pending - Flush all nbcon consoles using their
+ * write_atomic() callback
+ *
+ * Flush the backlog up through the currently newest record. Any new
+ * records added while flushing will not be flushed if there is another
+ * context available to handle the flushing. This is to avoid one CPU
+ * printing unbounded because other CPUs continue to add records.
+ */
+void nbcon_atomic_flush_pending(void)
+{
+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false);
+}
+
+/**
+ * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their
+ * write_atomic() callback and allowing unsafe hostile takeovers
+ *
+ * Flush the backlog up through the currently newest record. Unsafe hostile
+ * takeovers will be performed, if necessary.
+ */
+void nbcon_atomic_flush_unsafe(void)
+{
+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true);
+}
+
+/**
+ * nbcon_cpu_emergency_enter - Enter an emergency section where printk()
+ * messages for that CPU are flushed directly
+ *
+ * Context: Any context. Disables preemption.
+ *
+ * When within an emergency section, printk() calls will attempt to flush any
+ * pending messages in the ringbuffer.
+ */
+void nbcon_cpu_emergency_enter(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ preempt_disable();
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+ (*cpu_emergency_nesting)++;
+}
+
+/**
+ * nbcon_cpu_emergency_exit - Exit an emergency section
+ *
+ * Context: Within an emergency section. Enables preemption.
+ */
+void nbcon_cpu_emergency_exit(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+
+ if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0))
+ (*cpu_emergency_nesting)--;
+
+ preempt_enable();
}
/**
- * nbcon_init - Initialize the nbcon console specific data
+ * nbcon_alloc - Allocate and init the nbcon console specific data
* @con: Console to initialize
*
- * nbcon_alloc() *must* be called and succeed before this function
- * is called.
+ * Return: True if the console was fully allocated and initialized.
+ * Otherwise @con must not be registered.
*
- * This function expects that the legacy @con->seq has been set.
+ * When allocation and init was successful, the console must be properly
+ * freed using nbcon_free() once it is no longer needed.
*/
-void nbcon_init(struct console *con)
+bool nbcon_alloc(struct console *con)
{
struct nbcon_state state = { };
- /* nbcon_alloc() must have been called and successful! */
- BUG_ON(!con->pbufs);
+ /* The write_thread() callback is mandatory. */
+ if (WARN_ON(!con->write_thread))
+ return false;
- nbcon_seq_force(con, con->seq);
+ rcuwait_init(&con->rcuwait);
+ init_irq_work(&con->irq_work, nbcon_irq_work);
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL);
nbcon_state_set(con, &state);
+
+ /*
+ * Initialize @nbcon_seq to the highest possible sequence number so
+ * that practically speaking it will have nothing to print until a
+ * desired initial sequence number has been set via nbcon_seq_force().
+ */
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb));
+
+ if (con->flags & CON_BOOT) {
+ /*
+ * Boot console printing is synchronized with legacy console
+ * printing, so boot consoles can share the same global printk
+ * buffers.
+ */
+ con->pbufs = &printk_shared_pbufs;
+ } else {
+ con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
+ if (!con->pbufs) {
+ con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
+ return false;
+ }
+
+ if (printk_kthreads_running) {
+ if (!nbcon_kthread_create(con)) {
+ kfree(con->pbufs);
+ con->pbufs = NULL;
+ return false;
+ }
+ }
+ }
+
+ return true;
}
/**
@@ -986,6 +1721,9 @@ void nbcon_free(struct console *con)
{
struct nbcon_state state = { };
+ if (printk_kthreads_running)
+ nbcon_kthread_stop(con);
+
nbcon_state_set(con, &state);
/* Boot consoles share global printk buffers. */
@@ -994,3 +1732,85 @@ void nbcon_free(struct console *con)
con->pbufs = NULL;
}
+
+/**
+ * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe
+ * section
+ * @con: The nbcon console to acquire
+ *
+ * Context: Under the locking mechanism implemented in
+ * @con->device_lock() including disabling migration.
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * Console drivers will usually use their own internal synchronization
+ * mechanism to synchronize between console printing and non-printing
+ * activities (such as setting baud rates). However, nbcon console drivers
+ * supporting atomic consoles may also want to mark unsafe sections when
+ * performing non-printing activities in order to synchronize against their
+ * write_atomic() callback.
+ *
+ * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL
+ * and marks it unsafe for handover/takeover.
+ */
+bool nbcon_device_try_acquire(struct console *con)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
+
+ cant_migrate();
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->console = con;
+ ctxt->prio = NBCON_PRIO_NORMAL;
+
+ if (!nbcon_context_try_acquire(ctxt))
+ return false;
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(nbcon_device_try_acquire);
+
+/**
+ * nbcon_device_release - Exit unsafe section and release the nbcon console
+ * @con: The nbcon console acquired in nbcon_device_try_acquire()
+ */
+void nbcon_device_release(struct console *con)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
+ struct console_flush_type ft;
+ int cookie;
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return;
+
+ nbcon_context_release(ctxt);
+
+ /*
+ * This context must flush any new records added while the console
+ * was locked if the printer thread is not available to do it. The
+ * console_srcu_read_lock must be taken to ensure the console is
+ * usable throughout flushing.
+ */
+ cookie = console_srcu_read_lock();
+ printk_get_console_flush_type(&ft);
+ if (console_is_usable(con, console_srcu_read_flags(con), true) &&
+ !ft.nbcon_offload &&
+ prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
+ /*
+ * If nbcon_atomic flushing is not available, fallback to
+ * using the legacy loop.
+ */
+ if (ft.nbcon_atomic) {
+ __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false);
+ } else if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ } else if (ft.legacy_offload) {
+ printk_trigger_flush();
+ }
+ }
+ console_srcu_read_unlock(cookie);
+}
+EXPORT_SYMBOL_GPL(nbcon_device_release);
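+
A hedged sketch of a non-printing driver path using the acquire/release pair under the driver's device_lock(); baud-rate programming is an assumed example, not taken from this patch:

static void example_set_baud(struct console *con, unsigned int baud)
{
	unsigned long flags;

	con->device_lock(con, &flags);	/* also disables migration */

	if (nbcon_device_try_acquire(con)) {
		/* ... reprogram divisor registers without racing write_atomic() ... */
		nbcon_device_release(con);
	}

	con->device_unlock(con, flags);
}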
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 054c0e7784fd..71e4fe6f9b85 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -34,6 +34,7 @@
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
+#include <linux/syscore_ops.h>
#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
@@ -282,6 +283,7 @@ EXPORT_SYMBOL(console_list_unlock);
* Return: A cookie to pass to console_srcu_read_unlock().
*/
int console_srcu_read_lock(void)
+ __acquires(&console_srcu)
{
return srcu_read_lock_nmisafe(&console_srcu);
}
@@ -295,6 +297,7 @@ EXPORT_SYMBOL(console_srcu_read_lock);
* Counterpart to console_srcu_read_lock()
*/
void console_srcu_read_unlock(int cookie)
+ __releases(&console_srcu)
{
srcu_read_unlock_nmisafe(&console_srcu, cookie);
}
@@ -461,14 +464,43 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);
+/*
+ * Specifies if a legacy console is registered. If legacy consoles are
+ * present, it is necessary to perform the console lock/unlock dance
+ * whenever console flushing should occur.
+ */
+bool have_legacy_console;
+
+/*
+ * Specifies if an nbcon console is registered. If nbcon consoles are present,
+ * synchronous printing of legacy consoles will not occur during panic until
+ * the backtrace has been stored to the ringbuffer.
+ */
+bool have_nbcon_console;
+
+/*
+ * Specifies if a boot console is registered. If boot consoles are present,
+ * nbcon consoles cannot print simultaneously and must be synchronized by
+ * the console lock. This is because boot consoles and nbcon consoles may
+ * have mapped the same hardware.
+ */
+bool have_boot_console;
+
+/* See printk_legacy_allow_panic_sync() for details. */
+bool legacy_allow_panic_sync;
+
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
+static DECLARE_WAIT_QUEUE_HEAD(legacy_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
+/* True when _all_ printer threads are available for printing. */
+bool printk_kthreads_running;
+
struct latched_seq {
seqcount_latch_t latch;
u64 val[2];
@@ -1850,7 +1882,7 @@ static bool console_waiter;
* there may be a waiter spinning (like a spinlock). Also it must be
* ready to hand over the lock at the end of the section.
*/
-static void console_lock_spinning_enable(void)
+void console_lock_spinning_enable(void)
{
/*
* Do not use spinning in panic(). The panic CPU wants to keep the lock.
@@ -1889,7 +1921,7 @@ lockdep:
*
* Return: 1 if the lock rights were passed, 0 otherwise.
*/
-static int console_lock_spinning_disable_and_check(int cookie)
+int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
@@ -2300,12 +2332,30 @@ out:
return ret;
}
+/*
+ * This acts as a one-way switch to allow legacy consoles to print from
+ * the printk() caller context on a panic CPU. It also attempts to flush
+ * the legacy consoles in this context.
+ */
+void printk_legacy_allow_panic_sync(void)
+{
+ struct console_flush_type ft;
+
+ legacy_allow_panic_sync = true;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ }
+}
+
asmlinkage int vprintk_emit(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
+ struct console_flush_type ft;
int printed_len;
- bool in_sched = false;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
@@ -2316,20 +2366,29 @@ asmlinkage int vprintk_emit(int facility, int level,
* non-panic CPUs are generating any messages, they will be
* silently dropped.
*/
- if (other_cpu_in_panic())
+ if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace)
return 0;
+ printk_get_console_flush_type(&ft);
+
+ /* If called from the scheduler, we can not call up(). */
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
- in_sched = true;
+ ft.legacy_offload |= ft.legacy_direct;
+ ft.legacy_direct = false;
}
printk_delay(level);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- /* If called from the scheduler, we can not call up(). */
- if (!in_sched) {
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+
+ if (ft.legacy_direct) {
/*
* The caller may be holding system-critical or
* timing-sensitive locks. Disable preemption during
@@ -2349,7 +2408,7 @@ asmlinkage int vprintk_emit(int facility, int level,
preempt_enable();
}
- if (in_sched)
+ if (ft.legacy_offload)
defer_console_output();
else
wake_up_klogd();
@@ -2620,6 +2679,7 @@ int match_devname_and_update_preferred_console(const char *devname,
return -ENOENT;
}
+EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console);
bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
@@ -2677,6 +2737,7 @@ void suspend_console(void)
void resume_console(void)
{
+ struct console_flush_type ft;
struct console *con;
if (!console_suspend_enabled)
@@ -2694,6 +2755,12 @@ void resume_console(void)
*/
synchronize_srcu(&console_srcu);
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ if (ft.legacy_offload)
+ defer_console_output();
+
pr_flush(1000, true);
}
@@ -2708,10 +2775,16 @@ void resume_console(void)
*/
static int console_cpu_notify(unsigned int cpu)
{
+ struct console_flush_type ft;
+
if (!cpuhp_tasks_frozen) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ }
}
return 0;
}
@@ -2765,36 +2838,6 @@ int is_console_locked(void)
}
EXPORT_SYMBOL(is_console_locked);
-/*
- * Check if the given console is currently capable and allowed to print
- * records.
- *
- * Requires the console_srcu_read_lock.
- */
-static inline bool console_is_usable(struct console *con)
-{
- short flags = console_srcu_read_flags(con);
-
- if (!(flags & CON_ENABLED))
- return false;
-
- if ((flags & CON_SUSPENDED))
- return false;
-
- if (!con->write)
- return false;
-
- /*
- * Console drivers may assume that per-cpu resources have been
- * allocated. So unless they're explicitly marked as being able to
- * cope (CON_ANYTIME) don't call them until this CPU is officially up.
- */
- if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
- return false;
-
- return true;
-}
-
static void __console_unlock(void)
{
console_locked = 0;
@@ -2804,30 +2847,31 @@ static void __console_unlock(void)
#ifdef CONFIG_PRINTK
/*
- * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This
- * is achieved by shifting the existing message over and inserting the dropped
- * message.
+ * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting
+ * the existing message over and inserting the scratchbuf message.
*
- * @pmsg is the printk message to prepend.
+ * @pmsg is the original printk message.
+ * @fmt is the printf format of the message which will prepend the existing one.
*
- * @dropped is the dropped count to report in the dropped message.
- *
- * If the message text in @pmsg->pbufs->outbuf does not have enough space for
- * the dropped message, the message text will be sufficiently truncated.
+ * If there is not enough space in @pmsg->pbufs->outbuf, the existing
+ * message text will be sufficiently truncated.
*
* If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
*/
-void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+__printf(2, 3)
+static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...)
{
struct printk_buffers *pbufs = pmsg->pbufs;
const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
const size_t outbuf_sz = sizeof(pbufs->outbuf);
char *scratchbuf = &pbufs->scratchbuf[0];
char *outbuf = &pbufs->outbuf[0];
+ va_list args;
size_t len;
- len = scnprintf(scratchbuf, scratchbuf_sz,
- "** %lu printk messages dropped **\n", dropped);
+ va_start(args, fmt);
+ len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args);
+ va_end(args);
/*
* Make sure outbuf is sufficiently large before prepending.
@@ -2850,6 +2894,30 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
}
/*
+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message".
+ * @pmsg->outbuf_len is updated appropriately.
+ *
+ * @pmsg is the printk message to prepend.
+ *
+ * @dropped is the dropped count to report in the dropped message.
+ */
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+{
+ console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped);
+}
+
+/*
+ * Prepend the message in @pmsg->pbufs->outbuf with a "replay message".
+ * @pmsg->outbuf_len is updated appropriately.
+ *
+ * @pmsg is the printk message to prepend.
+ */
+void console_prepend_replay(struct printk_message *pmsg)
+{
+ console_prepend_message(pmsg, "** replaying previous printk message **\n");
+}
+
+/*
* Read and format the specified record (or a later record if the specified
* record is not available).
*
@@ -2915,6 +2983,34 @@ out:
}
/*
+ * Legacy console printing from printk() caller context does not respect
+ * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a
+ * false positive. For PREEMPT_RT the false positive condition does not
+ * occur.
+ *
+ * This map is used to temporarily establish LD_WAIT_SLEEP context for the
+ * console write() callback when legacy printing to avoid false positive
+ * lockdep complaints, thus allowing lockdep to continue to function for
+ * real issues.
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline void printk_legacy_allow_spinlock_enter(void) { }
+static inline void printk_legacy_allow_spinlock_exit(void) { }
+#else
+static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP);
+
+static inline void printk_legacy_allow_spinlock_enter(void)
+{
+ lock_map_acquire_try(&printk_legacy_map);
+}
+
+static inline void printk_legacy_allow_spinlock_exit(void)
+{
+ lock_map_release(&printk_legacy_map);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+/*
* Used as the printk buffers for non-panic, serialized console printing.
* This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
* Its usage requires the console_lock held.
@@ -2963,31 +3059,46 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co
con->dropped = 0;
}
- /*
- * While actively printing out messages, if another printk()
- * were to occur on another CPU, it may wait for this one to
- * finish. This task can not be preempted if there is a
- * waiter waiting to take over.
- *
- * Interrupts are disabled because the hand over to a waiter
- * must not be interrupted until the hand over is completed
- * (@console_waiter is cleared).
- */
- printk_safe_enter_irqsave(flags);
- console_lock_spinning_enable();
+ /* Write everything out to the hardware. */
- /* Do not trace print latency. */
- stop_critical_timings();
+ if (force_legacy_kthread() && !panic_in_progress()) {
+ /*
+ * With forced threading this function is in a task context
+ * (either legacy kthread or get_init_console_seq()). There
+ * is no need for concern about printk reentrance, handovers,
+ * or lockdep complaints.
+ */
- /* Write everything out to the hardware. */
- con->write(con, outbuf, pmsg.outbuf_len);
+ con->write(con, outbuf, pmsg.outbuf_len);
+ con->seq = pmsg.seq + 1;
+ } else {
+ /*
+ * While actively printing out messages, if another printk()
+ * were to occur on another CPU, it may wait for this one to
+ * finish. This task can not be preempted if there is a
+ * waiter waiting to take over.
+ *
+ * Interrupts are disabled because the hand over to a waiter
+ * must not be interrupted until the hand over is completed
+ * (@console_waiter is cleared).
+ */
+ printk_safe_enter_irqsave(flags);
+ console_lock_spinning_enable();
- start_critical_timings();
+ /* Do not trace print latency. */
+ stop_critical_timings();
- con->seq = pmsg.seq + 1;
+ printk_legacy_allow_spinlock_enter();
+ con->write(con, outbuf, pmsg.outbuf_len);
+ printk_legacy_allow_spinlock_exit();
- *handover = console_lock_spinning_disable_and_check(cookie);
- printk_safe_exit_irqrestore(flags);
+ start_critical_timings();
+
+ con->seq = pmsg.seq + 1;
+
+ *handover = console_lock_spinning_disable_and_check(cookie);
+ printk_safe_exit_irqrestore(flags);
+ }
skip:
return true;
}
@@ -3000,6 +3111,8 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co
return false;
}
+static inline void printk_kthreads_check_locked(void) { }
+
#endif /* CONFIG_PRINTK */
/*
@@ -3027,6 +3140,7 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co
*/
static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
{
+ struct console_flush_type ft;
bool any_usable = false;
struct console *con;
bool any_progress;
@@ -3038,15 +3152,34 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
do {
any_progress = false;
+ printk_get_console_flush_type(&ft);
+
cookie = console_srcu_read_lock();
for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+ u64 printk_seq;
bool progress;
- if (!console_is_usable(con))
+ /*
+ * console_flush_all() is only responsible for nbcon
+ * consoles when the nbcon consoles cannot print via
+ * their atomic or threaded flushing.
+ */
+ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
+ continue;
+
+ if (!console_is_usable(con, flags, !do_cond_resched))
continue;
any_usable = true;
- progress = console_emit_next_record(con, handover, cookie);
+ if (flags & CON_NBCON) {
+ progress = nbcon_legacy_emit_next_record(con, handover, cookie,
+ !do_cond_resched);
+ printk_seq = nbcon_seq_read(con);
+ } else {
+ progress = console_emit_next_record(con, handover, cookie);
+ printk_seq = con->seq;
+ }
/*
* If a handover has occurred, the SRCU read lock
@@ -3056,8 +3189,8 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
return false;
/* Track the next of the highest seq flushed. */
- if (con->seq > *next_seq)
- *next_seq = con->seq;
+ if (printk_seq > *next_seq)
+ *next_seq = printk_seq;
if (!progress)
continue;
@@ -3080,19 +3213,7 @@ abandon:
return false;
}
-/**
- * console_unlock - unblock the console subsystem from printing
- *
- * Releases the console_lock which the caller holds to block printing of
- * the console subsystem.
- *
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
- *
- * console_unlock(); may be called from any context.
- */
-void console_unlock(void)
+static void __console_flush_and_unlock(void)
{
bool do_cond_resched;
bool handover;
@@ -3136,6 +3257,29 @@ void console_unlock(void)
*/
} while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
}
+
+/**
+ * console_unlock - unblock the legacy console subsystem from printing
+ *
+ * Releases the console_lock which the caller holds to block printing of
+ * the legacy console subsystem.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock() emits the output on
+ * legacy consoles prior to releasing the lock.
+ *
+ * console_unlock() may be called from any context.
+ */
+void console_unlock(void)
+{
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.legacy_direct)
+ __console_flush_and_unlock();
+ else
+ __console_unlock();
+}
EXPORT_SYMBOL(console_unlock);
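Not part of the patch: a minimal reminder of the calling pattern this API keeps, now that the flushing behind console_unlock() depends on the computed flush type (the function name below is invented).

#include <linux/console.h>

/* Hypothetical caller: block legacy console output around a state update. */
static void example_reconfigure(void)
{
	console_lock();		/* legacy console output is buffered from here */
	/* ... modify console-related state ... */
	console_unlock();	/* flushes directly only when ft.legacy_direct */
}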
/**
@@ -3258,6 +3402,7 @@ static void __console_rewind_all(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ struct console_flush_type ft;
bool handover;
u64 next_seq;
@@ -3281,7 +3426,13 @@ void console_flush_on_panic(enum con_flush_mode mode)
if (mode == CONSOLE_REPLAY_ALL)
__console_rewind_all();
- console_flush_all(false, &next_seq, &handover);
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+
+ /* Flush legacy consoles once allowed, even when dangerous. */
+ if (legacy_allow_panic_sync)
+ console_flush_all(false, &next_seq, &handover);
}
/*
@@ -3338,13 +3489,236 @@ EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
+ struct console_flush_type ft;
+ bool is_nbcon;
+
console_list_lock();
console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ is_nbcon = console->flags & CON_NBCON;
console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. The related
+ * printing context must be able to see it is enabled so that
+ * it is guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
+ printk_get_console_flush_type(&ft);
+ if (is_nbcon && ft.nbcon_offload)
+ nbcon_kthread_wake(console);
+ else if (ft.legacy_offload)
+ defer_console_output();
+
__pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
+#ifdef CONFIG_PRINTK
+static int unregister_console_locked(struct console *console);
+
+/* True when system boot is far enough to create printer threads. */
+static bool printk_kthreads_ready __ro_after_init;
+
+static struct task_struct *printk_legacy_kthread;
+
+static bool legacy_kthread_should_wakeup(void)
+{
+ struct console_flush_type ft;
+ struct console *con;
+ bool ret = false;
+ int cookie;
+
+ if (kthread_should_stop())
+ return true;
+
+ printk_get_console_flush_type(&ft);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+ u64 printk_seq;
+
+ /*
+ * The legacy printer thread is only responsible for nbcon
+ * consoles when the nbcon consoles cannot print via their
+ * atomic or threaded flushing.
+ */
+ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
+ continue;
+
+ if (!console_is_usable(con, flags, false))
+ continue;
+
+ if (flags & CON_NBCON) {
+ printk_seq = nbcon_seq_read(con);
+ } else {
+ /*
+ * It is safe to read @seq because only this
+ * thread context updates @seq.
+ */
+ printk_seq = con->seq;
+ }
+
+ if (prb_read_valid(prb, printk_seq, NULL)) {
+ ret = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+
+ return ret;
+}
+
+static int legacy_kthread_func(void *unused)
+{
+ for (;;) {
+ wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup());
+
+ if (kthread_should_stop())
+ break;
+
+ console_lock();
+ __console_flush_and_unlock();
+ }
+
+ return 0;
+}
+
+static bool legacy_kthread_create(void)
+{
+ struct task_struct *kt;
+
+ lockdep_assert_console_list_lock_held();
+
+ kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy");
+ if (WARN_ON(IS_ERR(kt))) {
+ pr_err("failed to start legacy printing thread\n");
+ return false;
+ }
+
+ printk_legacy_kthread = kt;
+
+ /*
+ * It is important that console printing threads are scheduled
+ * shortly after a printk call and with generous runtime budgets.
+ */
+ sched_set_normal(printk_legacy_kthread, -20);
+
+ return true;
+}
+
+/**
+ * printk_kthreads_shutdown - shutdown all threaded printers
+ *
+ * On system shutdown all threaded printers are stopped. This allows printk
+ * to transition back to atomic printing, thus providing a robust mechanism
+ * for the final shutdown/reboot messages to be output.
+ */
+static void printk_kthreads_shutdown(void)
+{
+ struct console *con;
+
+ console_list_lock();
+ if (printk_kthreads_running) {
+ printk_kthreads_running = false;
+
+ for_each_console(con) {
+ if (con->flags & CON_NBCON)
+ nbcon_kthread_stop(con);
+ }
+
+ /*
+ * The threads may have been stopped while printing a
+ * backlog. Flush any records left over.
+ */
+ nbcon_atomic_flush_pending();
+ }
+ console_list_unlock();
+}
+
+static struct syscore_ops printk_syscore_ops = {
+ .shutdown = printk_kthreads_shutdown,
+};
+
+/*
+ * If appropriate, start nbcon kthreads and set @printk_kthreads_running.
+ * If any kthreads fail to start, those consoles are unregistered.
+ *
+ * Must be called under console_list_lock().
+ */
+static void printk_kthreads_check_locked(void)
+{
+ struct hlist_node *tmp;
+ struct console *con;
+
+ lockdep_assert_console_list_lock_held();
+
+ if (!printk_kthreads_ready)
+ return;
+
+ if (have_legacy_console || have_boot_console) {
+ if (!printk_legacy_kthread &&
+ force_legacy_kthread() &&
+ !legacy_kthread_create()) {
+ /*
+ * All legacy consoles must be unregistered. If there
+ * are any nbcon consoles, they will set up their own
+ * kthread.
+ */
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (con->flags & CON_NBCON)
+ continue;
+
+ unregister_console_locked(con);
+ }
+ }
+ } else if (printk_legacy_kthread) {
+ kthread_stop(printk_legacy_kthread);
+ printk_legacy_kthread = NULL;
+ }
+
+ /*
+ * Printer threads cannot be started as long as any boot console is
+ * registered because there is no way to synchronize the hardware
+ * registers between boot console code and regular console code.
+ * It can only be known that there will be no new boot consoles when
+ * an nbcon console is registered.
+ */
+ if (have_boot_console || !have_nbcon_console) {
+ /* Clear flag in case all nbcon consoles unregistered. */
+ printk_kthreads_running = false;
+ return;
+ }
+
+ if (printk_kthreads_running)
+ return;
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (!(con->flags & CON_NBCON))
+ continue;
+
+ if (!nbcon_kthread_create(con))
+ unregister_console_locked(con);
+ }
+
+ printk_kthreads_running = true;
+}
+
+static int __init printk_set_kthreads_ready(void)
+{
+ register_syscore_ops(&printk_syscore_ops);
+
+ console_list_lock();
+ printk_kthreads_ready = true;
+ printk_kthreads_check_locked();
+ console_list_unlock();
+
+ return 0;
+}
+early_initcall(printk_set_kthreads_ready);
+#endif /* CONFIG_PRINTK */
+
static int __read_mostly keep_bootcon;
static int __init keep_bootcon_setup(char *str)
@@ -3446,19 +3820,21 @@ static void try_enable_default_console(struct console *newcon)
newcon->flags |= CON_CONSDEV;
}
-static void console_init_seq(struct console *newcon, bool bootcon_registered)
+/* Return the starting sequence number for a newly registered console. */
+static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered)
{
struct console *con;
bool handover;
+ u64 init_seq;
if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
/* Get a consistent copy of @syslog_seq. */
mutex_lock(&syslog_lock);
- newcon->seq = syslog_seq;
+ init_seq = syslog_seq;
mutex_unlock(&syslog_lock);
} else {
/* Begin with next message added to ringbuffer. */
- newcon->seq = prb_next_seq(prb);
+ init_seq = prb_next_seq(prb);
/*
* If any enabled boot consoles are due to be unregistered
@@ -3479,7 +3855,7 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered)
* Flush all consoles and set the console to start at
* the next unprinted sequence number.
*/
- if (!console_flush_all(true, &newcon->seq, &handover)) {
+ if (!console_flush_all(true, &init_seq, &handover)) {
/*
* Flushing failed. Just choose the lowest
* sequence of the enabled boot consoles.
@@ -3492,19 +3868,30 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered)
if (handover)
console_lock();
- newcon->seq = prb_next_seq(prb);
+ init_seq = prb_next_seq(prb);
for_each_console(con) {
- if ((con->flags & CON_BOOT) &&
- (con->flags & CON_ENABLED) &&
- con->seq < newcon->seq) {
- newcon->seq = con->seq;
+ u64 seq;
+
+ if (!(con->flags & CON_BOOT) ||
+ !(con->flags & CON_ENABLED)) {
+ continue;
}
+
+ if (con->flags & CON_NBCON)
+ seq = nbcon_seq_read(con);
+ else
+ seq = con->seq;
+
+ if (seq < init_seq)
+ init_seq = seq;
}
}
console_unlock();
}
}
+
+ return init_seq;
}
#define console_first() \
@@ -3533,9 +3920,12 @@ static int unregister_console_locked(struct console *console);
*/
void register_console(struct console *newcon)
{
- struct console *con;
+ bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic;
bool bootcon_registered = false;
bool realcon_registered = false;
+ struct console *con;
+ unsigned long flags;
+ u64 init_seq;
int err;
console_list_lock();
@@ -3613,10 +4003,31 @@ void register_console(struct console *newcon)
}
newcon->dropped = 0;
- console_init_seq(newcon, bootcon_registered);
+ init_seq = get_init_console_seq(newcon, bootcon_registered);
- if (newcon->flags & CON_NBCON)
- nbcon_init(newcon);
+ if (newcon->flags & CON_NBCON) {
+ have_nbcon_console = true;
+ nbcon_seq_force(newcon, init_seq);
+ } else {
+ have_legacy_console = true;
+ newcon->seq = init_seq;
+ }
+
+ if (newcon->flags & CON_BOOT)
+ have_boot_console = true;
+
+ /*
+ * If another context is actively using the hardware of this new
+ * console, it will not be aware of the nbcon synchronization. This
+	 * creates a risk that two contexts could access the hardware
+ * simultaneously if this new console is used for atomic printing
+ * and the other context is still using the hardware.
+ *
+ * Use the driver synchronization to ensure that the hardware is not
+ * in use while this new console transitions to being registered.
+ */
+ if (use_device_lock)
+ newcon->device_lock(newcon, &flags);
/*
* Put this console in the list - keep the
@@ -3642,6 +4053,10 @@ void register_console(struct console *newcon)
* register_console() completes.
*/
+ /* This new console is now registered. */
+ if (use_device_lock)
+ newcon->device_unlock(newcon, flags);
+
console_sysfs_notify();
/*
@@ -3662,6 +4077,9 @@ void register_console(struct console *newcon)
unregister_console_locked(con);
}
}
+
+ /* Changed console list, may require printer threads to start/stop. */
+ printk_kthreads_check_locked();
unlock:
console_list_unlock();
}
@@ -3670,6 +4088,12 @@ EXPORT_SYMBOL(register_console);
/* Must be called under console_list_lock(). */
static int unregister_console_locked(struct console *console)
{
+ bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic;
+ bool found_legacy_con = false;
+ bool found_nbcon_con = false;
+ bool found_boot_con = false;
+ unsigned long flags;
+ struct console *c;
int res;
lockdep_assert_console_list_lock_held();
@@ -3682,14 +4106,29 @@ static int unregister_console_locked(struct console *console)
if (res > 0)
return 0;
+ if (!console_is_registered_locked(console))
+ res = -ENODEV;
+ else if (console_is_usable(console, console->flags, true))
+ __pr_flush(console, 1000, true);
+
/* Disable it unconditionally */
console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
- if (!console_is_registered_locked(console))
- return -ENODEV;
+ if (res < 0)
+ return res;
+
+ /*
+ * Use the driver synchronization to ensure that the hardware is not
+ * in use while this console transitions to being unregistered.
+ */
+ if (use_device_lock)
+ console->device_lock(console, &flags);
hlist_del_init_rcu(&console->node);
+ if (use_device_lock)
+ console->device_unlock(console, flags);
+
/*
* <HISTORICAL>
* If this isn't the last console and it has CON_CONSDEV set, we
@@ -3717,6 +4156,29 @@ static int unregister_console_locked(struct console *console)
if (console->exit)
res = console->exit(console);
+ /*
+ * With this console gone, the global flags tracking registered
+ * console types may have changed. Update them.
+ */
+ for_each_console(c) {
+ if (c->flags & CON_BOOT)
+ found_boot_con = true;
+
+ if (c->flags & CON_NBCON)
+ found_nbcon_con = true;
+ else
+ found_legacy_con = true;
+ }
+ if (!found_boot_con)
+ have_boot_console = found_boot_con;
+ if (!found_legacy_con)
+ have_legacy_console = found_legacy_con;
+ if (!found_nbcon_con)
+ have_nbcon_console = found_nbcon_con;
+
+ /* Changed console list, may require printer threads to start/stop. */
+ printk_kthreads_check_locked();
+
return res;
}
@@ -3863,6 +4325,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
{
unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
unsigned long remaining_jiffies = timeout_jiffies;
+ struct console_flush_type ft;
struct console *c;
u64 last_diff = 0;
u64 printk_seq;
@@ -3871,13 +4334,22 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
u64 diff;
u64 seq;
+ /* Sorry, pr_flush() will not work this early. */
+ if (system_state < SYSTEM_SCHEDULING)
+ return false;
+
might_sleep();
seq = prb_next_reserve_seq(prb);
/* Flush the consoles so that records up to @seq are printed. */
- console_lock();
- console_unlock();
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.legacy_direct) {
+ console_lock();
+ console_unlock();
+ }
for (;;) {
unsigned long begin_jiffies;
@@ -3890,6 +4362,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
* console->seq. Releasing console_lock flushes more
* records in case @seq is still not printed on all
* usable consoles.
+ *
+ * Holding the console_lock is not necessary if there
+ * are no legacy or boot consoles. However, such a
+ * console could register at any time. Always hold the
+ * console_lock as a precaution rather than
+ * synchronizing against register_console().
*/
console_lock();
@@ -3905,8 +4383,10 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
* that they make forward progress, so only increment
* @diff for usable consoles.
*/
- if (!console_is_usable(c))
+ if (!console_is_usable(c, flags, true) &&
+ !console_is_usable(c, flags, false)) {
continue;
+ }
if (flags & CON_NBCON) {
printk_seq = nbcon_seq_read(c);
@@ -3974,9 +4454,13 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
int pending = this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
+ if (force_legacy_kthread()) {
+ if (printk_legacy_kthread)
+ wake_up_interruptible(&legacy_wait);
+ } else {
+ if (console_trylock())
+ console_unlock();
+ }
}
if (pending & PRINTK_PENDING_WAKEUP)
@@ -4382,8 +4866,17 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
*/
void console_try_replay_all(void)
{
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
if (console_trylock()) {
__console_rewind_all();
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ if (ft.legacy_offload)
+ defer_console_output();
/* Consoles are flushed as part of console_unlock(). */
console_unlock();
}
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 52626d0f1fa3..4ef81349d9fb 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -4,7 +4,10 @@
#define _KERNEL_PRINTK_RINGBUFFER_H
#include <linux/atomic.h>
+#include <linux/bits.h>
#include <linux/dev_printk.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
/*
* Meta information about each stored message.
@@ -120,7 +123,7 @@ enum desc_state {
#define _DATA_SIZE(sz_bits) (1UL << (sz_bits))
#define _DESCS_COUNT(ct_bits) (1U << (ct_bits))
-#define DESC_SV_BITS (sizeof(unsigned long) * 8)
+#define DESC_SV_BITS BITS_PER_LONG
#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2)
#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT)
#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT))
@@ -401,10 +404,12 @@ u64 prb_next_reserve_seq(struct printk_ringbuffer *rb);
#define __u64seq_to_ulseq(u64seq) (u64seq)
#define __ulseq_to_u64seq(rb, ulseq) (ulseq)
+#define ULSEQ_MAX(rb) (-1)
#else /* CONFIG_64BIT */
#define __u64seq_to_ulseq(u64seq) ((u32)u64seq)
+#define ULSEQ_MAX(rb) __u64seq_to_ulseq(prb_first_seq(rb) + 0x80000000UL)
static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq)
{
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 6d10927a07d8..2b35a9d3919d 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -26,6 +26,29 @@ void __printk_safe_exit(void)
this_cpu_dec(printk_context);
}
+void __printk_deferred_enter(void)
+{
+ cant_migrate();
+ __printk_safe_enter();
+}
+
+void __printk_deferred_exit(void)
+{
+ cant_migrate();
+ __printk_safe_exit();
+}
+
+bool is_printk_legacy_deferred(void)
+{
+ /*
+ * The per-CPU variable @printk_context can be read safely in any
+ * context. CPU migration is always disabled when set.
+ */
+ return (force_legacy_kthread() ||
+ this_cpu_read(printk_context) ||
+ in_nmi());
+}
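A hedged usage sketch (not from the patch; the struct and values are invented) of the deferred-printing section these helpers back, via the printk_deferred_enter()/printk_deferred_exit() wrappers assumed to be declared in include/linux/printk.h.

#include <linux/printk.h>
#include <linux/spinlock.h>

struct example_state {
	raw_spinlock_t lock;
	int value;
};

/* Hypothetical caller holding a lock that console drivers may also take. */
static void example_report(struct example_state *s)
{
	raw_spin_lock(&s->lock);
	printk_deferred_enter();	/* console flushing is deferred from here */
	pr_warn("example: state %d needs attention\n", s->value);
	printk_deferred_exit();
	raw_spin_unlock(&s->lock);
}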
+
asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
@@ -38,7 +61,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if (this_cpu_read(printk_context) || in_nmi())
+ if (is_printk_legacy_deferred())
return vprintk_deferred(fmt, args);
/* No obstacles. */
diff --git a/kernel/profile.c b/kernel/profile.c
index 2b775cc5c28f..1fcf1adcf4eb 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -47,30 +47,14 @@ static unsigned short int prof_shift;
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
-static cpumask_var_t prof_cpu_mask;
-#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
-static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
-static DEFINE_PER_CPU(int, cpu_profile_flip);
-static DEFINE_MUTEX(profile_flip_mutex);
-#endif /* CONFIG_SMP */
-
int profile_setup(char *str)
{
static const char schedstr[] = "schedule";
- static const char sleepstr[] = "sleep";
static const char kvmstr[] = "kvm";
const char *select = NULL;
int par;
- if (!strncmp(str, sleepstr, strlen(sleepstr))) {
-#ifdef CONFIG_SCHEDSTATS
- force_schedstat_enabled();
- prof_on = SLEEP_PROFILING;
- select = sleepstr;
-#else
- pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
-#endif /* CONFIG_SCHEDSTATS */
- } else if (!strncmp(str, schedstr, strlen(schedstr))) {
+ if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
select = schedstr;
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
@@ -114,11 +98,6 @@ int __ref profile_init(void)
buffer_bytes = prof_len*sizeof(atomic_t);
- if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_copy(prof_cpu_mask, cpu_possible_mask);
-
prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
if (prof_buffer)
return 0;
@@ -132,195 +111,16 @@ int __ref profile_init(void)
if (prof_buffer)
return 0;
- free_cpumask_var(prof_cpu_mask);
return -ENOMEM;
}
-#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
-/*
- * Each cpu has a pair of open-addressed hashtables for pending
- * profile hits. read_profile() IPI's all cpus to request them
- * to flip buffers and flushes their contents to prof_buffer itself.
- * Flip requests are serialized by the profile_flip_mutex. The sole
- * use of having a second hashtable is for avoiding cacheline
- * contention that would otherwise happen during flushes of pending
- * profile hits required for the accuracy of reported profile hits
- * and so resurrect the interrupt livelock issue.
- *
- * The open-addressed hashtables are indexed by profile buffer slot
- * and hold the number of pending hits to that profile buffer slot on
- * a cpu in an entry. When the hashtable overflows, all pending hits
- * are accounted to their corresponding profile buffer slots with
- * atomic_add() and the hashtable emptied. As numerous pending hits
- * may be accounted to a profile buffer slot in a hashtable entry,
- * this amortizes a number of atomic profile buffer increments likely
- * to be far larger than the number of entries in the hashtable,
- * particularly given that the number of distinct profile buffer
- * positions to which hits are accounted during short intervals (e.g.
- * several seconds) is usually very small. Exclusion from buffer
- * flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
- * process context).
- * The hash function is meant to be lightweight as opposed to strong,
- * and was vaguely inspired by ppc64 firmware-supported inverted
- * pagetable hash functions, but uses a full hashtable full of finite
- * collision chains, not just pairs of them.
- *
- * -- nyc
- */
-static void __profile_flip_buffers(void *unused)
-{
- int cpu = smp_processor_id();
-
- per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
-}
-
-static void profile_flip_buffers(void)
-{
- int i, j, cpu;
-
- mutex_lock(&profile_flip_mutex);
- j = per_cpu(cpu_profile_flip, get_cpu());
- put_cpu();
- on_each_cpu(__profile_flip_buffers, NULL, 1);
- for_each_online_cpu(cpu) {
- struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
- for (i = 0; i < NR_PROFILE_HIT; ++i) {
- if (!hits[i].hits) {
- if (hits[i].pc)
- hits[i].pc = 0;
- continue;
- }
- atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
- hits[i].hits = hits[i].pc = 0;
- }
- }
- mutex_unlock(&profile_flip_mutex);
-}
-
-static void profile_discard_flip_buffers(void)
-{
- int i, cpu;
-
- mutex_lock(&profile_flip_mutex);
- i = per_cpu(cpu_profile_flip, get_cpu());
- put_cpu();
- on_each_cpu(__profile_flip_buffers, NULL, 1);
- for_each_online_cpu(cpu) {
- struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
- memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
- }
- mutex_unlock(&profile_flip_mutex);
-}
-
-static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
-{
- unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
- int i, j, cpu;
- struct profile_hit *hits;
-
- pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
- i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
- secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
- cpu = get_cpu();
- hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
- if (!hits) {
- put_cpu();
- return;
- }
- /*
- * We buffer the global profiler buffer into a per-CPU
- * queue and thus reduce the number of global (and possibly
- * NUMA-alien) accesses. The write-queue is self-coalescing:
- */
- local_irq_save(flags);
- do {
- for (j = 0; j < PROFILE_GRPSZ; ++j) {
- if (hits[i + j].pc == pc) {
- hits[i + j].hits += nr_hits;
- goto out;
- } else if (!hits[i + j].hits) {
- hits[i + j].pc = pc;
- hits[i + j].hits = nr_hits;
- goto out;
- }
- }
- i = (i + secondary) & (NR_PROFILE_HIT - 1);
- } while (i != primary);
-
- /*
- * Add the current hit(s) and flush the write-queue out
- * to the global buffer:
- */
- atomic_add(nr_hits, &prof_buffer[pc]);
- for (i = 0; i < NR_PROFILE_HIT; ++i) {
- atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
- hits[i].pc = hits[i].hits = 0;
- }
-out:
- local_irq_restore(flags);
- put_cpu();
-}
-
-static int profile_dead_cpu(unsigned int cpu)
-{
- struct page *page;
- int i;
-
- if (cpumask_available(prof_cpu_mask))
- cpumask_clear_cpu(cpu, prof_cpu_mask);
-
- for (i = 0; i < 2; i++) {
- if (per_cpu(cpu_profile_hits, cpu)[i]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
- per_cpu(cpu_profile_hits, cpu)[i] = NULL;
- __free_page(page);
- }
- }
- return 0;
-}
-
-static int profile_prepare_cpu(unsigned int cpu)
-{
- int i, node = cpu_to_mem(cpu);
- struct page *page;
-
- per_cpu(cpu_profile_flip, cpu) = 0;
-
- for (i = 0; i < 2; i++) {
- if (per_cpu(cpu_profile_hits, cpu)[i])
- continue;
-
- page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
- if (!page) {
- profile_dead_cpu(cpu);
- return -ENOMEM;
- }
- per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
-
- }
- return 0;
-}
-
-static int profile_online_cpu(unsigned int cpu)
-{
- if (cpumask_available(prof_cpu_mask))
- cpumask_set_cpu(cpu, prof_cpu_mask);
-
- return 0;
-}
-
-#else /* !CONFIG_SMP */
-#define profile_flip_buffers() do { } while (0)
-#define profile_discard_flip_buffers() do { } while (0)
-
static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long pc;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
- atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
+ if (pc < prof_len)
+ atomic_add(nr_hits, &prof_buffer[pc]);
}
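The simplified hit accounting above reduces to one shift and one bounds check; a standalone arithmetic sketch with made-up addresses follows.

#include <stdio.h>

int main(void)
{
	unsigned long stext      = 0xffffffff81000000UL; /* assumed text base */
	unsigned long pc         = 0xffffffff81000450UL; /* sampled address   */
	unsigned int  prof_shift = 4;                    /* 16-byte buckets   */
	unsigned long prof_len   = 0x100000;             /* number of slots   */
	unsigned long slot       = (pc - stext) >> prof_shift;

	if (slot < prof_len)	/* out-of-range samples are simply dropped */
		printf("hit slot %#lx\n", slot);	/* 0x45 here */
	return 0;
}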
-#endif /* !CONFIG_SMP */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
@@ -334,8 +134,8 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
- if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
- cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
+ /* This is the old kernel-only legacy profiling */
+ if (!user_mode(regs))
profile_hit(type, (void *)profile_pc(regs));
}
@@ -358,7 +158,6 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
char *pnt;
unsigned long sample_step = 1UL << prof_shift;
- profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
return 0;
if (count > (prof_len+1)*sizeof(unsigned int) - p)
@@ -404,7 +203,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
return -EINVAL;
}
#endif
- profile_discard_flip_buffers();
memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
return count;
}
@@ -418,40 +216,14 @@ static const struct proc_ops profile_proc_ops = {
int __ref create_proc_profile(void)
{
struct proc_dir_entry *entry;
-#ifdef CONFIG_SMP
- enum cpuhp_state online_state;
-#endif
-
int err = 0;
if (!prof_on)
return 0;
-#ifdef CONFIG_SMP
- err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
- profile_prepare_cpu, profile_dead_cpu);
- if (err)
- return err;
-
- err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
- profile_online_cpu, NULL);
- if (err < 0)
- goto err_state_prep;
- online_state = err;
- err = 0;
-#endif
entry = proc_create("profile", S_IWUSR | S_IRUGO,
NULL, &profile_proc_ops);
- if (!entry)
- goto err_state_onl;
- proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
-
- return err;
-err_state_onl:
-#ifdef CONFIG_SMP
- cpuhp_remove_state(online_state);
-err_state_prep:
- cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
-#endif
+ if (entry)
+ proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
return err;
}
subsys_initcall(create_proc_profile);
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 38238e595a61..feb3ac1dc5d5 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -54,9 +54,6 @@
* grace-period sequence number.
*/
-#define RCU_SEQ_CTR_SHIFT 2
-#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
-
/* Low-order bit definition for polled grace-period APIs. */
#define RCU_GET_STATE_COMPLETED 0x1
@@ -255,6 +252,11 @@ static inline void debug_rcu_head_callback(struct rcu_head *rhp)
kmem_dump_obj(rhp);
}
+static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp)
+{
+ return rhp->next == rhp;
+}
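The new helper treats a self-linked ->next pointer as the completion marker; below is a minimal, purely illustrative sketch of that convention (helper names invented, not how the barrier code itself is structured).

#include <linux/types.h>

/* Illustrative only: mark an rcu_head as finished by making it self-linked. */
static void example_mark_done(struct rcu_head *rhp)
{
	rhp->next = rhp;
}

/* Same test as rcu_barrier_cb_is_done() above. */
static bool example_is_done(struct rcu_head *rhp)
{
	return rhp->next == rhp;
}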
+
extern int rcu_cpu_stall_suppress_at_boot;
static inline bool rcu_stall_is_suppressed_at_boot(void)
@@ -606,7 +608,7 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
#endif
#ifdef CONFIG_TINY_RCU
-static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; }
+static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; }
static inline unsigned long rcu_get_gp_seq(void) { return 0; }
static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
static inline unsigned long
@@ -619,7 +621,7 @@ static inline void rcu_fwd_progress_check(unsigned long j) { }
static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
#else /* #ifdef CONFIG_TINY_RCU */
-bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
+bool rcu_watching_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
unsigned long srcu_batches_completed(struct srcu_struct *sp);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 1693ea22ef1b..298a2c573f02 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -261,17 +261,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Mark the specified rcu_segcblist structure as offloaded (or not)
- */
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
-{
- if (offload)
- rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED);
- else
- rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
-}
-
-/*
* Does the specified rcu_segcblist structure contain callbacks that
* are ready to be invoked?
*/
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 4fe877f5f654..259904075636 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -89,16 +89,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING))
- return true;
-
- return false;
-}
-
-static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
-{
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE))
+ rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED))
return true;
return false;
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index b53a9e8f5904..6d37596deb1f 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -39,6 +39,7 @@
#include <linux/torture.h>
#include <linux/vmalloc.h>
#include <linux/rcupdate_trace.h>
+#include <linux/sched/debug.h>
#include "rcu.h"
@@ -104,6 +105,20 @@ static char *scale_type = "rcu";
module_param(scale_type, charp, 0444);
MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)");
+// Structure definitions for custom fixed-per-task allocator.
+struct writer_mblock {
+ struct rcu_head wmb_rh;
+ struct llist_node wmb_node;
+ struct writer_freelist *wmb_wfl;
+};
+
+struct writer_freelist {
+ struct llist_head ws_lhg;
+ atomic_t ws_inflight;
+ struct llist_head ____cacheline_internodealigned_in_smp ws_lhp;
+ struct writer_mblock *ws_mblocks;
+};
+
static int nrealreaders;
static int nrealwriters;
static struct task_struct **writer_tasks;
@@ -111,6 +126,8 @@ static struct task_struct **reader_tasks;
static struct task_struct *shutdown_task;
static u64 **writer_durations;
+static bool *writer_done;
+static struct writer_freelist *writer_freelists;
static int *writer_n_durations;
static atomic_t n_rcu_scale_reader_started;
static atomic_t n_rcu_scale_writer_started;
@@ -120,7 +137,6 @@ static u64 t_rcu_scale_writer_started;
static u64 t_rcu_scale_writer_finished;
static unsigned long b_rcu_gp_test_started;
static unsigned long b_rcu_gp_test_finished;
-static DEFINE_PER_CPU(atomic_t, n_async_inflight);
#define MAX_MEAS 10000
#define MIN_MEAS 100
@@ -143,6 +159,7 @@ struct rcu_scale_ops {
void (*sync)(void);
void (*exp_sync)(void);
struct task_struct *(*rso_gp_kthread)(void);
+ void (*stats)(void);
const char *name;
};
@@ -224,6 +241,11 @@ static void srcu_scale_synchronize(void)
synchronize_srcu(srcu_ctlp);
}
+static void srcu_scale_stats(void)
+{
+ srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG);
+}
+
static void srcu_scale_synchronize_expedited(void)
{
synchronize_srcu_expedited(srcu_ctlp);
@@ -241,6 +263,7 @@ static struct rcu_scale_ops srcu_ops = {
.gp_barrier = srcu_rcu_barrier,
.sync = srcu_scale_synchronize,
.exp_sync = srcu_scale_synchronize_expedited,
+ .stats = srcu_scale_stats,
.name = "srcu"
};
@@ -270,6 +293,7 @@ static struct rcu_scale_ops srcud_ops = {
.gp_barrier = srcu_rcu_barrier,
.sync = srcu_scale_synchronize,
.exp_sync = srcu_scale_synchronize_expedited,
+ .stats = srcu_scale_stats,
.name = "srcud"
};
@@ -288,6 +312,11 @@ static void tasks_scale_read_unlock(int idx)
{
}
+static void rcu_tasks_scale_stats(void)
+{
+ rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
static struct rcu_scale_ops tasks_ops = {
.ptype = RCU_TASKS_FLAVOR,
.init = rcu_sync_scale_init,
@@ -300,6 +329,7 @@ static struct rcu_scale_ops tasks_ops = {
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
.rso_gp_kthread = get_rcu_tasks_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats,
.name = "tasks"
};
@@ -326,6 +356,11 @@ static void tasks_rude_scale_read_unlock(int idx)
{
}
+static void rcu_tasks_rude_scale_stats(void)
+{
+ rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
static struct rcu_scale_ops tasks_rude_ops = {
.ptype = RCU_TASKS_RUDE_FLAVOR,
.init = rcu_sync_scale_init,
@@ -333,11 +368,10 @@ static struct rcu_scale_ops tasks_rude_ops = {
.readunlock = tasks_rude_scale_read_unlock,
.get_gp_seq = rcu_no_completed,
.gp_diff = rcu_seq_diff,
- .async = call_rcu_tasks_rude,
- .gp_barrier = rcu_barrier_tasks_rude,
.sync = synchronize_rcu_tasks_rude,
.exp_sync = synchronize_rcu_tasks_rude,
.rso_gp_kthread = get_rcu_tasks_rude_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats,
.name = "tasks-rude"
};
@@ -366,6 +400,11 @@ static void tasks_trace_scale_read_unlock(int idx)
rcu_read_unlock_trace();
}
+static void rcu_tasks_trace_scale_stats(void)
+{
+ rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
static struct rcu_scale_ops tasks_tracing_ops = {
.ptype = RCU_TASKS_FLAVOR,
.init = rcu_sync_scale_init,
@@ -378,6 +417,7 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
.rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats,
.name = "tasks-tracing"
};
@@ -438,12 +478,52 @@ rcu_scale_reader(void *arg)
}
/*
+ * Allocate a writer_mblock structure for the specified rcu_scale_writer
+ * task.
+ */
+static struct writer_mblock *rcu_scale_alloc(long me)
+{
+ struct llist_node *llnp;
+ struct writer_freelist *wflp;
+ struct writer_mblock *wmbp;
+
+ if (WARN_ON_ONCE(!writer_freelists))
+ return NULL;
+ wflp = &writer_freelists[me];
+ if (llist_empty(&wflp->ws_lhp)) {
+ // ->ws_lhp is private to its rcu_scale_writer task.
+ wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node);
+ wflp->ws_lhp.first = &wmbp->wmb_node;
+ }
+ llnp = llist_del_first(&wflp->ws_lhp);
+ if (!llnp)
+ return NULL;
+ return container_of(llnp, struct writer_mblock, wmb_node);
+}
+
+/*
+ * Free a writer_mblock structure to its rcu_scale_writer task.
+ */
+static void rcu_scale_free(struct writer_mblock *wmbp)
+{
+ struct writer_freelist *wflp;
+
+ if (!wmbp)
+ return;
+ wflp = wmbp->wmb_wfl;
+ llist_add(&wmbp->wmb_node, &wflp->ws_lhg);
+}
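The pair above is an instance of a common split freelist pattern: any context returns blocks to a shared llist with llist_add(), and only the owning task refills its private list with a single llist_del_all() when it runs empty. A reduced sketch with invented names:

#include <linux/kernel.h>
#include <linux/llist.h>

struct example_block {
	struct llist_node node;
};

struct example_freelist {
	struct llist_head shared;	/* llist_add() from any context */
	struct llist_head private;	/* touched only by the owner    */
};

static struct example_block *example_get(struct example_freelist *fl)
{
	struct llist_node *n;

	if (llist_empty(&fl->private))
		fl->private.first = llist_del_all(&fl->shared);
	n = llist_del_first(&fl->private);
	return n ? container_of(n, struct example_block, node) : NULL;
}

static void example_put(struct example_freelist *fl, struct example_block *b)
{
	llist_add(&b->node, &fl->shared);
}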
+
+/*
* Callback function for asynchronous grace periods from rcu_scale_writer().
*/
static void rcu_scale_async_cb(struct rcu_head *rhp)
{
- atomic_dec(this_cpu_ptr(&n_async_inflight));
- kfree(rhp);
+ struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh);
+ struct writer_freelist *wflp = wmbp->wmb_wfl;
+
+ atomic_dec(&wflp->ws_inflight);
+ rcu_scale_free(wmbp);
}
/*
@@ -456,12 +536,14 @@ rcu_scale_writer(void *arg)
int i_max;
unsigned long jdone;
long me = (long)arg;
- struct rcu_head *rhp = NULL;
+ bool selfreport = false;
bool started = false, done = false, alldone = false;
u64 t;
DEFINE_TORTURE_RANDOM(tr);
u64 *wdp;
u64 *wdpp = writer_durations[me];
+ struct writer_freelist *wflp = &writer_freelists[me];
+ struct writer_mblock *wmbp = NULL;
VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
WARN_ON(!wdpp);
@@ -493,30 +575,34 @@ rcu_scale_writer(void *arg)
jdone = jiffies + minruntime * HZ;
do {
+ bool gp_succeeded = false;
+
if (writer_holdoff)
udelay(writer_holdoff);
if (writer_holdoff_jiffies)
schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
- if (gp_async) {
-retry:
- if (!rhp)
- rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
- if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
- atomic_inc(this_cpu_ptr(&n_async_inflight));
- cur_ops->async(rhp, rcu_scale_async_cb);
- rhp = NULL;
+ if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) {
+ if (!wmbp)
+ wmbp = rcu_scale_alloc(me);
+ if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) {
+ atomic_inc(&wflp->ws_inflight);
+ cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb);
+ wmbp = NULL;
+ gp_succeeded = true;
} else if (!kthread_should_stop()) {
cur_ops->gp_barrier();
- goto retry;
} else {
- kfree(rhp); /* Because we are stopping. */
+ rcu_scale_free(wmbp); /* Because we are stopping. */
+ wmbp = NULL;
}
} else if (gp_exp) {
cur_ops->exp_sync();
+ gp_succeeded = true;
} else {
cur_ops->sync();
+ gp_succeeded = true;
}
t = ktime_get_mono_fast_ns();
*wdp = t - *wdp;
@@ -526,6 +612,7 @@ retry:
started = true;
if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
done = true;
+ WRITE_ONCE(writer_done[me], true);
sched_set_normal(current, 0);
pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
scale_type, SCALE_FLAG, me, MIN_MEAS);
@@ -551,11 +638,32 @@ retry:
if (done && !alldone &&
atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters)
alldone = true;
- if (started && !alldone && i < MAX_MEAS - 1)
+ if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) {
+ static atomic_t dumped;
+ int i;
+
+ if (!atomic_xchg(&dumped, 1)) {
+ for (i = 0; i < nrealwriters; i++) {
+ if (writer_done[i])
+ continue;
+ pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i);
+ sched_show_task(writer_tasks[i]);
+ }
+ if (cur_ops->stats)
+ cur_ops->stats();
+ }
+ }
+ if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) {
+ pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n",
+ __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone);
+ selfreport = true;
+ }
+ if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1)
i++;
rcu_scale_wait_shutdown();
} while (!torture_must_stop());
- if (gp_async) {
+ if (gp_async && cur_ops->async) {
+ rcu_scale_free(wmbp);
cur_ops->gp_barrier();
}
writer_n_durations[me] = i_max + 1;
@@ -713,6 +821,7 @@ kfree_scale_cleanup(void)
torture_stop_kthread(kfree_scale_thread,
kfree_reader_tasks[i]);
kfree(kfree_reader_tasks);
+ kfree_reader_tasks = NULL;
}
torture_cleanup_end();
@@ -881,6 +990,7 @@ rcu_scale_cleanup(void)
torture_stop_kthread(rcu_scale_reader,
reader_tasks[i]);
kfree(reader_tasks);
+ reader_tasks = NULL;
}
if (writer_tasks) {
@@ -919,10 +1029,33 @@ rcu_scale_cleanup(void)
schedule_timeout_uninterruptible(1);
}
kfree(writer_durations[i]);
+ if (writer_freelists) {
+ int ctr = 0;
+ struct llist_node *llnp;
+ struct writer_freelist *wflp = &writer_freelists[i];
+
+ if (wflp->ws_mblocks) {
+ llist_for_each(llnp, wflp->ws_lhg.first)
+ ctr++;
+ llist_for_each(llnp, wflp->ws_lhp.first)
+ ctr++;
+ WARN_ONCE(ctr != gp_async_max,
+ "%s: ctr = %d gp_async_max = %d\n",
+ __func__, ctr, gp_async_max);
+ kfree(wflp->ws_mblocks);
+ }
+ }
}
kfree(writer_tasks);
+ writer_tasks = NULL;
kfree(writer_durations);
+ writer_durations = NULL;
kfree(writer_n_durations);
+ writer_n_durations = NULL;
+ kfree(writer_done);
+ writer_done = NULL;
+ kfree(writer_freelists);
+ writer_freelists = NULL;
}
/* Do torture-type-specific cleanup operations. */
@@ -949,8 +1082,9 @@ rcu_scale_shutdown(void *arg)
static int __init
rcu_scale_init(void)
{
- long i;
int firsterr = 0;
+ long i;
+ long j;
static struct rcu_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
};
@@ -1017,14 +1151,22 @@ rcu_scale_init(void)
}
while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
schedule_timeout_uninterruptible(1);
- writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]),
- GFP_KERNEL);
- writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
- GFP_KERNEL);
- writer_n_durations =
- kcalloc(nrealwriters, sizeof(*writer_n_durations),
- GFP_KERNEL);
- if (!writer_tasks || !writer_durations || !writer_n_durations) {
+ writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL);
+ writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL);
+ writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL);
+ writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL);
+ if (gp_async) {
+ if (gp_async_max <= 0) {
+ pr_warn("%s: gp_async_max = %d must be greater than zero.\n",
+ __func__, gp_async_max);
+ WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST));
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+ writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL);
+ }
+ if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done ||
+ (gp_async && !writer_freelists)) {
SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
@@ -1037,6 +1179,24 @@ rcu_scale_init(void)
firsterr = -ENOMEM;
goto unwind;
}
+ if (writer_freelists) {
+ struct writer_freelist *wflp = &writer_freelists[i];
+
+ init_llist_head(&wflp->ws_lhg);
+ init_llist_head(&wflp->ws_lhp);
+ wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]),
+ GFP_KERNEL);
+ if (!wflp->ws_mblocks) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (j = 0; j < gp_async_max; j++) {
+ struct writer_mblock *wmbp = &wflp->ws_mblocks[j];
+
+ wmbp->wmb_wfl = wflp;
+ llist_add(&wmbp->wmb_node, &wflp->ws_lhp);
+ }
+ }
firsterr = torture_create_kthread(rcu_scale_writer, (void *)i,
writer_tasks[i]);
if (torture_init_error(firsterr))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 08bf7c669dd3..bb75dbf5c800 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -115,6 +115,7 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s
torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
+torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one.");
torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
@@ -366,8 +367,6 @@ struct rcu_torture_ops {
bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
unsigned long (*get_gp_state)(void);
void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
- unsigned long (*get_gp_completed)(void);
- void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
unsigned long (*start_gp_poll)(void);
void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
bool (*poll_gp_state)(unsigned long oldstate);
@@ -375,6 +374,8 @@ struct rcu_torture_ops {
bool (*poll_need_2gp)(bool poll, bool poll_full);
void (*cond_sync)(unsigned long oldstate);
void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
+ int poll_active;
+ int poll_active_full;
call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
@@ -553,8 +554,6 @@ static struct rcu_torture_ops rcu_ops = {
.get_comp_state_full = get_completed_synchronize_rcu_full,
.get_gp_state = get_state_synchronize_rcu,
.get_gp_state_full = get_state_synchronize_rcu_full,
- .get_gp_completed = get_completed_synchronize_rcu,
- .get_gp_completed_full = get_completed_synchronize_rcu_full,
.start_gp_poll = start_poll_synchronize_rcu,
.start_gp_poll_full = start_poll_synchronize_rcu_full,
.poll_gp_state = poll_state_synchronize_rcu,
@@ -562,6 +561,8 @@ static struct rcu_torture_ops rcu_ops = {
.poll_need_2gp = rcu_poll_need_2gp,
.cond_sync = cond_synchronize_rcu,
.cond_sync_full = cond_synchronize_rcu_full,
+ .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE,
+ .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE,
.get_gp_state_exp = get_state_synchronize_rcu,
.start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
.start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
@@ -740,9 +741,12 @@ static struct rcu_torture_ops srcu_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .same_gp_state = same_state_synchronize_srcu,
+ .get_comp_state = get_completed_synchronize_srcu,
.get_gp_state = srcu_torture_get_gp_state,
.start_gp_poll = srcu_torture_start_gp_poll,
.poll_gp_state = srcu_torture_poll_gp_state,
+ .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
@@ -780,9 +784,12 @@ static struct rcu_torture_ops srcud_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .same_gp_state = same_state_synchronize_srcu,
+ .get_comp_state = get_completed_synchronize_srcu,
.get_gp_state = srcu_torture_get_gp_state,
.start_gp_poll = srcu_torture_start_gp_poll,
.poll_gp_state = srcu_torture_poll_gp_state,
+ .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
@@ -915,11 +922,6 @@ static struct rcu_torture_ops tasks_ops = {
* Definitions for rude RCU-tasks torture testing.
*/
-static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p)
-{
- call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb);
-}
-
static struct rcu_torture_ops tasks_rude_ops = {
.ttype = RCU_TASKS_RUDE_FLAVOR,
.init = rcu_sync_torture_init,
@@ -927,11 +929,8 @@ static struct rcu_torture_ops tasks_rude_ops = {
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
.get_gp_seq = rcu_no_completed,
- .deferred_free = rcu_tasks_rude_torture_deferred_free,
.sync = synchronize_rcu_tasks_rude,
.exp_sync = synchronize_rcu_tasks_rude,
- .call = call_rcu_tasks_rude,
- .cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
.get_gp_data = rcu_tasks_rude_get_gp_data,
.cbflood_max = 50000,
@@ -1318,6 +1317,7 @@ static void rcu_torture_write_types(void)
} else if (gp_sync && !cur_ops->sync) {
pr_alert("%s: gp_sync without primitives.\n", __func__);
}
+ pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes);
}
/*
@@ -1374,17 +1374,20 @@ rcu_torture_writer(void *arg)
int i;
int idx;
int oldnice = task_nice(current);
- struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE];
+ struct rcu_gp_oldstate *rgo = NULL;
+ int rgo_size = 0;
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
unsigned long stallsdone = jiffies;
bool stutter_waited;
- unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ unsigned long *ulo = NULL;
+ int ulo_size = 0;
// If a new stall test is added, this must be adjusted.
if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
- stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ;
+ stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) *
+ HZ * (stall_cpu_repeat + 1);
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite)
pr_alert("%s" TORTURE_FLAG
@@ -1401,6 +1404,16 @@ rcu_torture_writer(void *arg)
torture_kthread_stopping("rcu_torture_writer");
return 0;
}
+ if (cur_ops->poll_active > 0) {
+ ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL);
+ if (!WARN_ON(!ulo))
+ ulo_size = cur_ops->poll_active;
+ }
+ if (cur_ops->poll_active_full > 0) {
+ rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL);
+ if (!WARN_ON(!rgo))
+ rgo_size = cur_ops->poll_active_full;
+ }
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
@@ -1437,8 +1450,8 @@ rcu_torture_writer(void *arg)
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
- if (cur_ops->get_gp_completed) {
- cookie = cur_ops->get_gp_completed();
+ if (cur_ops->get_comp_state) {
+ cookie = cur_ops->get_comp_state();
WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
}
cur_ops->readunlock(idx);
@@ -1452,8 +1465,8 @@ rcu_torture_writer(void *arg)
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cpumask_pr_args(cpu_online_mask));
- if (cur_ops->get_gp_completed_full) {
- cur_ops->get_gp_completed_full(&cookie_full);
+ if (cur_ops->get_comp_state_full) {
+ cur_ops->get_comp_state_full(&cookie_full);
WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
}
cur_ops->readunlock(idx);
@@ -1502,19 +1515,19 @@ rcu_torture_writer(void *arg)
break;
case RTWS_POLL_GET:
rcu_torture_writer_state = RTWS_POLL_GET;
- for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ for (i = 0; i < ulo_size; i++)
ulo[i] = cur_ops->get_comp_state();
gp_snap = cur_ops->start_gp_poll();
rcu_torture_writer_state = RTWS_POLL_WAIT;
while (!cur_ops->poll_gp_state(gp_snap)) {
gp_snap1 = cur_ops->get_gp_state();
- for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ for (i = 0; i < ulo_size; i++)
if (cur_ops->poll_gp_state(ulo[i]) ||
cur_ops->same_gp_state(ulo[i], gp_snap1)) {
ulo[i] = gp_snap1;
break;
}
- WARN_ON_ONCE(i >= ARRAY_SIZE(ulo));
+ WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size);
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
}
@@ -1522,20 +1535,20 @@ rcu_torture_writer(void *arg)
break;
case RTWS_POLL_GET_FULL:
rcu_torture_writer_state = RTWS_POLL_GET_FULL;
- for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ for (i = 0; i < rgo_size; i++)
cur_ops->get_comp_state_full(&rgo[i]);
cur_ops->start_gp_poll_full(&gp_snap_full);
rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
cur_ops->get_gp_state_full(&gp_snap1_full);
- for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ for (i = 0; i < rgo_size; i++)
if (cur_ops->poll_gp_state_full(&rgo[i]) ||
cur_ops->same_gp_state_full(&rgo[i],
&gp_snap1_full)) {
rgo[i] = gp_snap1_full;
break;
}
- WARN_ON_ONCE(i >= ARRAY_SIZE(rgo));
+ WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size);
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
}
@@ -1617,6 +1630,8 @@ rcu_torture_writer(void *arg)
pr_alert("%s" TORTURE_FLAG
" Dynamic grace-period expediting was disabled.\n",
torture_type);
+ kfree(ulo);
+ kfree(rgo);
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
@@ -2370,7 +2385,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"test_boost=%d/%d test_boost_interval=%d "
"test_boost_duration=%d shutdown_secs=%d "
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
- "stall_cpu_block=%d "
+ "stall_cpu_block=%d stall_cpu_repeat=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
"read_exit_delay=%d read_exit_burst=%d "
@@ -2382,7 +2397,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
test_boost, cur_ops->can_boost,
test_boost_interval, test_boost_duration, shutdown_secs,
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
- stall_cpu_block,
+ stall_cpu_block, stall_cpu_repeat,
n_barrier_cbs,
onoff_interval, onoff_holdoff,
read_exit_delay, read_exit_burst,
@@ -2460,19 +2475,11 @@ static struct notifier_block rcu_torture_stall_block = {
* induces a CPU stall for the time specified by stall_cpu. If a new
* stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
*/
-static int rcu_torture_stall(void *args)
+static void rcu_torture_stall_one(int rep, int irqsoff)
{
int idx;
- int ret;
unsigned long stop_at;
- VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
- if (rcu_cpu_stall_notifiers) {
- ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
- if (ret)
- pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
- __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
- }
if (stall_cpu_holdoff > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -2492,12 +2499,12 @@ static int rcu_torture_stall(void *args)
stop_at = ktime_get_seconds() + stall_cpu;
/* RCU CPU stall is expected behavior in following code. */
idx = cur_ops->readlock();
- if (stall_cpu_irqsoff)
+ if (irqsoff)
local_irq_disable();
else if (!stall_cpu_block)
preempt_disable();
- pr_alert("%s start on CPU %d.\n",
- __func__, raw_smp_processor_id());
+ pr_alert("%s start stall episode %d on CPU %d.\n",
+ __func__, rep + 1, raw_smp_processor_id());
while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) &&
!kthread_should_stop())
if (stall_cpu_block) {
@@ -2509,12 +2516,42 @@ static int rcu_torture_stall(void *args)
} else if (stall_no_softlockup) {
touch_softlockup_watchdog();
}
- if (stall_cpu_irqsoff)
+ if (irqsoff)
local_irq_enable();
else if (!stall_cpu_block)
preempt_enable();
cur_ops->readunlock(idx);
}
+}
+
+/*
+ * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many
+ * additional times as specified by the stall_cpu_repeat module parameter.
+ * Note that stall_cpu_irqsoff is ignored on the second and subsequent
+ * stalls.
+ */
+static int rcu_torture_stall(void *args)
+{
+ int i;
+ int repeat = stall_cpu_repeat;
+ int ret;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (repeat < 0) {
+ repeat = 0;
+ WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST));
+ }
+ if (rcu_cpu_stall_notifiers) {
+ ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+ __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+ }
+ for (i = 0; i <= repeat; i++) {
+ if (kthread_should_stop())
+ break;
+ rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0);
+ }
pr_alert("%s end.\n", __func__);
if (rcu_cpu_stall_notifiers && !ret) {
ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
@@ -2680,7 +2717,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp)
rcu_torture_fwd_prog_cond_resched(freed);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
}
}
@@ -2830,7 +2867,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
}
}
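
For reference, a minimal sketch (not part of the patch) of the polled-grace-period snapshot handling the rcutorture.c hunks above introduce: the snapshot array is sized from the flavor's poll_active count, degrades to a zero-sized array on allocation failure, and the "no slot matched" warning is conditioned on the size so a zero-sized array never warns. Only poll_active comes from the patch; the demo_* names are illustrative.

/* Illustrative sketch only -- not part of the patch above. */
#include <linux/slab.h>

static unsigned long *demo_ulo;
static int demo_ulo_size;

static void demo_alloc_poll_snapshots(int poll_active)
{
	if (poll_active > 0) {
		demo_ulo = kzalloc(poll_active * sizeof(demo_ulo[0]), GFP_KERNEL);
		if (!WARN_ON(!demo_ulo))
			demo_ulo_size = poll_active;	/* else stay at size 0 */
	}
}

static void demo_free_poll_snapshots(void)
{
	kfree(demo_ulo);	/* kfree(NULL) is a no-op, so the failure path is safe */
	demo_ulo = NULL;
	demo_ulo_size = 0;
}
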
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index f4ea5b1ec068..0db9db73f57f 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -28,6 +28,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/reboot.h>
#include <linux/sched.h>
+#include <linux/seq_buf.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/stat.h>
@@ -134,7 +135,7 @@ struct ref_scale_ops {
const char *name;
};
-static struct ref_scale_ops *cur_ops;
+static const struct ref_scale_ops *cur_ops;
static void un_delay(const int udl, const int ndl)
{
@@ -170,7 +171,7 @@ static bool rcu_sync_scale_init(void)
return true;
}
-static struct ref_scale_ops rcu_ops = {
+static const struct ref_scale_ops rcu_ops = {
.init = rcu_sync_scale_init,
.readsection = ref_rcu_read_section,
.delaysection = ref_rcu_delay_section,
@@ -204,7 +205,7 @@ static void srcu_ref_scale_delay_section(const int nloops, const int udl, const
}
}
-static struct ref_scale_ops srcu_ops = {
+static const struct ref_scale_ops srcu_ops = {
.init = rcu_sync_scale_init,
.readsection = srcu_ref_scale_read_section,
.delaysection = srcu_ref_scale_delay_section,
@@ -231,7 +232,7 @@ static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, c
un_delay(udl, ndl);
}
-static struct ref_scale_ops rcu_tasks_ops = {
+static const struct ref_scale_ops rcu_tasks_ops = {
.init = rcu_sync_scale_init,
.readsection = rcu_tasks_ref_scale_read_section,
.delaysection = rcu_tasks_ref_scale_delay_section,
@@ -270,7 +271,7 @@ static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, c
}
}
-static struct ref_scale_ops rcu_trace_ops = {
+static const struct ref_scale_ops rcu_trace_ops = {
.init = rcu_sync_scale_init,
.readsection = rcu_trace_ref_scale_read_section,
.delaysection = rcu_trace_ref_scale_delay_section,
@@ -309,7 +310,7 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int
}
}
-static struct ref_scale_ops refcnt_ops = {
+static const struct ref_scale_ops refcnt_ops = {
.init = rcu_sync_scale_init,
.readsection = ref_refcnt_section,
.delaysection = ref_refcnt_delay_section,
@@ -346,7 +347,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int
}
}
-static struct ref_scale_ops rwlock_ops = {
+static const struct ref_scale_ops rwlock_ops = {
.init = ref_rwlock_init,
.readsection = ref_rwlock_section,
.delaysection = ref_rwlock_delay_section,
@@ -383,7 +384,7 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n
}
}
-static struct ref_scale_ops rwsem_ops = {
+static const struct ref_scale_ops rwsem_ops = {
.init = ref_rwsem_init,
.readsection = ref_rwsem_section,
.delaysection = ref_rwsem_delay_section,
@@ -418,7 +419,7 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd
preempt_enable();
}
-static struct ref_scale_ops lock_ops = {
+static const struct ref_scale_ops lock_ops = {
.readsection = ref_lock_section,
.delaysection = ref_lock_delay_section,
.name = "lock"
@@ -453,7 +454,7 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in
preempt_enable();
}
-static struct ref_scale_ops lock_irq_ops = {
+static const struct ref_scale_ops lock_irq_ops = {
.readsection = ref_lock_irq_section,
.delaysection = ref_lock_irq_delay_section,
.name = "lock-irq"
@@ -489,7 +490,7 @@ static void ref_acqrel_delay_section(const int nloops, const int udl, const int
preempt_enable();
}
-static struct ref_scale_ops acqrel_ops = {
+static const struct ref_scale_ops acqrel_ops = {
.readsection = ref_acqrel_section,
.delaysection = ref_acqrel_delay_section,
.name = "acqrel"
@@ -523,7 +524,7 @@ static void ref_clock_delay_section(const int nloops, const int udl, const int n
stopopts = x;
}
-static struct ref_scale_ops clock_ops = {
+static const struct ref_scale_ops clock_ops = {
.readsection = ref_clock_section,
.delaysection = ref_clock_delay_section,
.name = "clock"
@@ -555,7 +556,7 @@ static void ref_jiffies_delay_section(const int nloops, const int udl, const int
stopopts = x;
}
-static struct ref_scale_ops jiffies_ops = {
+static const struct ref_scale_ops jiffies_ops = {
.readsection = ref_jiffies_section,
.delaysection = ref_jiffies_delay_section,
.name = "jiffies"
@@ -705,9 +706,9 @@ static void refscale_typesafe_ctor(void *rtsp_in)
preempt_enable();
}
-static struct ref_scale_ops typesafe_ref_ops;
-static struct ref_scale_ops typesafe_lock_ops;
-static struct ref_scale_ops typesafe_seqlock_ops;
+static const struct ref_scale_ops typesafe_ref_ops;
+static const struct ref_scale_ops typesafe_lock_ops;
+static const struct ref_scale_ops typesafe_seqlock_ops;
// Initialize for a typesafe test.
static bool typesafe_init(void)
@@ -768,7 +769,7 @@ static void typesafe_cleanup(void)
}
// The typesafe_init() function distinguishes these structures by address.
-static struct ref_scale_ops typesafe_ref_ops = {
+static const struct ref_scale_ops typesafe_ref_ops = {
.init = typesafe_init,
.cleanup = typesafe_cleanup,
.readsection = typesafe_read_section,
@@ -776,7 +777,7 @@ static struct ref_scale_ops typesafe_ref_ops = {
.name = "typesafe_ref"
};
-static struct ref_scale_ops typesafe_lock_ops = {
+static const struct ref_scale_ops typesafe_lock_ops = {
.init = typesafe_init,
.cleanup = typesafe_cleanup,
.readsection = typesafe_read_section,
@@ -784,7 +785,7 @@ static struct ref_scale_ops typesafe_lock_ops = {
.name = "typesafe_lock"
};
-static struct ref_scale_ops typesafe_seqlock_ops = {
+static const struct ref_scale_ops typesafe_seqlock_ops = {
.init = typesafe_init,
.cleanup = typesafe_cleanup,
.readsection = typesafe_read_section,
@@ -891,32 +892,34 @@ static u64 process_durations(int n)
{
int i;
struct reader_task *rt;
- char buf1[64];
+ struct seq_buf s;
char *buf;
u64 sum = 0;
buf = kmalloc(800 + 64, GFP_KERNEL);
if (!buf)
return 0;
- buf[0] = 0;
- sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
- exp_idx);
+ seq_buf_init(&s, buf, 800 + 64);
+
+ seq_buf_printf(&s, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+ exp_idx);
for (i = 0; i < n && !torture_must_stop(); i++) {
rt = &(reader_tasks[i]);
- sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
if (i % 5 == 0)
- strcat(buf, "\n");
- if (strlen(buf) >= 800) {
- pr_alert("%s", buf);
- buf[0] = 0;
+ seq_buf_putc(&s, '\n');
+
+ if (seq_buf_used(&s) >= 800) {
+ pr_alert("%s", seq_buf_str(&s));
+ seq_buf_clear(&s);
}
- strcat(buf, buf1);
+
+ seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns);
sum += rt->last_duration_ns;
}
- pr_alert("%s\n", buf);
+ pr_alert("%s\n", seq_buf_str(&s));
kfree(buf);
return sum;
@@ -1023,7 +1026,7 @@ end:
}
static void
-ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
+ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
"--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
@@ -1078,7 +1081,7 @@ ref_scale_init(void)
{
long i;
int firsterr = 0;
- static struct ref_scale_ops *scale_ops[] = {
+ static const struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
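
A hedged sketch of the seq_buf pattern the process_durations() hunk above switches to: seq_buf tracks its own length and bounds, so the hand-rolled sprintf()/strcat()/strlen() bookkeeping goes away. The helper name and buffer sizes below are illustrative; the seq_buf calls are the ones used in the hunk.

/* Illustrative sketch only -- not part of the patch above. */
#include <linux/printk.h>
#include <linux/seq_buf.h>
#include <linux/slab.h>

static void demo_seq_buf_report(const u64 *vals, int n)
{
	struct seq_buf s;
	char *buf = kmalloc(864, GFP_KERNEL);
	int i;

	if (!buf)
		return;
	seq_buf_init(&s, buf, 864);
	seq_buf_printf(&s, "Report (%d entries)", n);
	for (i = 0; i < n; i++) {
		if (i % 5 == 0)
			seq_buf_putc(&s, '\n');
		if (seq_buf_used(&s) >= 800) {		/* flush before the buffer fills */
			pr_alert("%s", seq_buf_str(&s));
			seq_buf_clear(&s);
		}
		seq_buf_printf(&s, "%d: %llu\t", i, vals[i]);
	}
	pr_alert("%s\n", seq_buf_str(&s));	/* seq_buf_str() NUL-terminates */
	kfree(buf);
}
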
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index b24db425f16d..31706e3293bc 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -137,6 +137,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head;
sdp->mynode = NULL;
sdp->cpu = cpu;
INIT_WORK(&sdp->work, srcu_invoke_callbacks);
@@ -247,7 +248,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
ssp->srcu_idx = 0;
- ssp->srcu_sup->srcu_gp_seq = 0;
+ ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_barrier_seq = 0;
mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
@@ -258,7 +259,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
if (!ssp->sda)
goto err_free_sup;
init_srcu_struct_data(ssp);
- ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
@@ -266,7 +267,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
ssp->srcu_sup->srcu_ssp = ssp;
- smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed,
+ SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */
return 0;
err_free_sda:
@@ -628,6 +630,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
if (time_after(j, gpstart))
jbase += j - gpstart;
if (!jbase) {
+ ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay);
WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
jbase = 1;
@@ -1560,6 +1563,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
struct srcu_data *sdp;
struct srcu_struct *ssp;
+ rhp->next = rhp; // Mark the callback as having been invoked.
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
ssp = sdp->ssp;
if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
@@ -1818,6 +1822,7 @@ static void process_srcu(struct work_struct *work)
} else {
j = jiffies;
if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count);
WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
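
The srcu_barrier hunks above point each barrier rcu_head's ->next at the rcu_head itself, both at init time and again from the barrier callback, so "this callback has run (or was never posted)" reduces to a single pointer comparison. A predicate of that shape is sketched below; the tasks.h hunks further down call a helper named rcu_barrier_cb_is_done(), which is assumed to work the same way, but the body here is an assumption rather than a quote of it.

/* Illustrative sketch only -- not part of the patch above. */
#include <linux/types.h>

static inline bool demo_barrier_cb_is_done(struct rcu_head *rhp)
{
	return rhp->next == rhp;	/* self-pointing ->next marks "done" */
}
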
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ba3440a45b6d..6333f4ccf024 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -34,6 +34,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @rtp_blkd_tasks: List of tasks blocked as readers.
* @rtp_exit_list: List of tasks in the latter portion of do_exit().
* @cpu: CPU number corresponding to this entry.
+ * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure.
* @rtpp: Pointer to the rcu_tasks structure.
*/
struct rcu_tasks_percpu {
@@ -49,6 +50,7 @@ struct rcu_tasks_percpu {
struct list_head rtp_blkd_tasks;
struct list_head rtp_exit_list;
int cpu;
+ int index;
struct rcu_tasks *rtpp;
};
@@ -63,7 +65,7 @@ struct rcu_tasks_percpu {
* @init_fract: Initial backoff sleep interval.
* @gp_jiffies: Time of last @gp_state transition.
* @gp_start: Most recent grace-period start in jiffies.
- * @tasks_gp_seq: Number of grace periods completed since boot.
+ * @tasks_gp_seq: Number of grace periods completed since boot in upper bits.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
@@ -76,6 +78,7 @@ struct rcu_tasks_percpu {
* @call_func: This flavor's call_rcu()-equivalent function.
* @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE).
* @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @rtpcp_array: Array of pointers to rcu_tasks_percpu structures of CPUs in cpu_possible_mask.
* @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
* @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
* @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
@@ -84,6 +87,7 @@ struct rcu_tasks_percpu {
* @barrier_q_count: Number of queues being waited on.
* @barrier_q_completion: Barrier wait/wakeup mechanism.
* @barrier_q_seq: Sequence number for barrier operations.
+ * @barrier_q_start: Most recent barrier start in jiffies.
* @name: This flavor's textual name.
* @kname: This flavor's kthread name.
*/
@@ -110,6 +114,7 @@ struct rcu_tasks {
call_rcu_func_t call_func;
unsigned int wait_state;
struct rcu_tasks_percpu __percpu *rtpcpu;
+ struct rcu_tasks_percpu **rtpcp_array;
int percpu_enqueue_shift;
int percpu_enqueue_lim;
int percpu_dequeue_lim;
@@ -118,6 +123,7 @@ struct rcu_tasks {
atomic_t barrier_q_count;
struct completion barrier_q_completion;
unsigned long barrier_q_seq;
+ unsigned long barrier_q_start;
char *name;
char *kname;
};
@@ -182,6 +188,8 @@ module_param(rcu_task_collapse_lim, int, 0444);
static int rcu_task_lazy_lim __read_mostly = 32;
module_param(rcu_task_lazy_lim, int, 0444);
+static int rcu_task_cpu_ids;
+
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
#define RTGS_WAIT_WAIT_CBS 1
@@ -245,6 +253,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
int cpu;
int lim;
int shift;
+ int maxcpu;
+ int index = 0;
if (rcu_task_enqueue_lim < 0) {
rcu_task_enqueue_lim = 1;
@@ -254,14 +264,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
}
lim = rcu_task_enqueue_lim;
- if (lim > nr_cpu_ids)
- lim = nr_cpu_ids;
- shift = ilog2(nr_cpu_ids / lim);
- if (((nr_cpu_ids - 1) >> shift) >= lim)
- shift++;
- WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
- WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
- smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL);
+ BUG_ON(!rtp->rtpcp_array);
+
for_each_possible_cpu(cpu) {
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
@@ -273,14 +278,30 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
+ rtpcp->index = index;
+ rtp->rtpcp_array[index] = rtpcp;
+ index++;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
if (!rtpcp->rtp_exit_list.next)
INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head;
+ maxcpu = cpu;
}
- pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
- data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
+ rcu_task_cpu_ids = maxcpu + 1;
+ if (lim > rcu_task_cpu_ids)
+ lim = rcu_task_cpu_ids;
+ shift = ilog2(rcu_task_cpu_ids / lim);
+ if (((rcu_task_cpu_ids - 1) >> shift) >= lim)
+ shift++;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+ smp_store_release(&rtp->percpu_enqueue_lim, lim);
+
+ pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n",
+ rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim),
+ rcu_task_cb_adjust, rcu_task_cpu_ids);
}
// Compute wakeup time for lazy callback timer.
@@ -339,6 +360,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
rcu_read_lock();
ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+ WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids);
rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
@@ -348,7 +370,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
rtpcp->rtp_n_lock_retries = 0;
}
if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
- READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
+ READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids)
needadjust = true; // Defer adjustment to avoid deadlock.
}
// Queuing callbacks before initialization not yet supported.
@@ -368,10 +390,10 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
if (unlikely(needadjust)) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
- if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
+ if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) {
WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
- WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
- smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids);
+ smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids);
pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
@@ -388,6 +410,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
struct rcu_tasks *rtp;
struct rcu_tasks_percpu *rtpcp;
+ rhp->next = rhp; // Mark the callback as having been invoked.
rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
rtp = rtpcp->rtpp;
if (atomic_dec_and_test(&rtp->barrier_q_count))
@@ -396,7 +419,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
// Wait for all in-flight callbacks for the specified RCU Tasks flavor.
// Operates in a manner similar to rcu_barrier().
-static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
+static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
{
int cpu;
unsigned long flags;
@@ -409,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
mutex_unlock(&rtp->barrier_q_mutex);
return;
}
+ rtp->barrier_q_start = jiffies;
rcu_seq_start(&rtp->barrier_q_seq);
init_completion(&rtp->barrier_q_completion);
atomic_set(&rtp->barrier_q_count, 2);
@@ -444,6 +468,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
for (cpu = 0; cpu < dequeue_limit; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
/* Advance and accelerate any new callbacks. */
@@ -481,7 +507,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rtp->percpu_enqueue_lim > 1) {
- WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
+ WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids));
smp_store_release(&rtp->percpu_enqueue_lim, 1);
rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
gpdone = false;
@@ -496,7 +522,9 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
}
if (rtp->percpu_dequeue_lim == 1) {
- for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+ for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
@@ -511,30 +539,32 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
// Advance callbacks and invoke any that are ready.
static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
{
- int cpu;
- int cpunext;
int cpuwq;
unsigned long flags;
int len;
+ int index;
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
struct rcu_tasks_percpu *rtpcp_next;
- cpu = rtpcp->cpu;
- cpunext = cpu * 2 + 1;
- if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
- rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
- cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
- queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
- cpunext++;
- if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
- rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
- cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ index = rtpcp->index * 2 + 1;
+ if (index < num_possible_cpus()) {
+ rtpcp_next = rtp->rtpcp_array[index];
+ if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ index++;
+ if (index < num_possible_cpus()) {
+ rtpcp_next = rtp->rtpcp_array[index];
+ if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ }
+ }
}
}
- if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu))
+ if (rcu_segcblist_empty(&rtpcp->cblist))
return;
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
@@ -687,9 +717,7 @@ static void __init rcu_tasks_bootup_oddness(void)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-#endif /* #ifndef CONFIG_TINY_RCU */
-#ifndef CONFIG_TINY_RCU
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
@@ -723,6 +751,53 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
rtp->lazy_jiffies,
s);
}
+
+/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. */
+static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt,
+ char *tf, char *tst)
+{
+ cpumask_var_t cm;
+ int cpu;
+ bool gotcb = false;
+ unsigned long j = jiffies;
+
+ pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n",
+ tt, tf, tst, data_race(rtp->tasks_gp_seq),
+ j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies),
+ data_race(rtp->gp_state), tasks_gp_state_getname(rtp));
+ pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n",
+ data_race(rtp->percpu_enqueue_shift),
+ data_race(rtp->percpu_enqueue_lim),
+ data_race(rtp->percpu_dequeue_lim),
+ data_race(rtp->percpu_dequeue_gpseq));
+ (void)zalloc_cpumask_var(&cm, GFP_KERNEL);
+ pr_alert("\tCallback counts:");
+ for_each_possible_cpu(cpu) {
+ long n;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head))
+ cpumask_set_cpu(cpu, cm);
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (!n)
+ continue;
+ pr_cont(" %d:%ld", cpu, n);
+ gotcb = true;
+ }
+ if (gotcb)
+ pr_cont(".\n");
+ else
+ pr_cont(" (none).\n");
+ pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ",
+ data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start),
+ atomic_read(&rtp->barrier_q_count));
+ if (cpumask_available(cm) && !cpumask_empty(cm))
+ pr_cont(" %*pbl.\n", cpumask_pr_args(cm));
+ else
+ pr_cont("(none).\n");
+ free_cpumask_var(cm);
+}
+
#endif // #ifndef CONFIG_TINY_RCU
static void exit_tasks_rcu_finish_trace(struct task_struct *t);
@@ -1174,6 +1249,12 @@ void show_rcu_tasks_classic_gp_kthread(void)
show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
+
+void rcu_tasks_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print);
#endif // !defined(CONFIG_TINY_RCU)
struct task_struct *get_rcu_tasks_gp_kthread(void)
@@ -1244,13 +1325,12 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
////////////////////////////////////////////////////////////////////////
//
-// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
-// passing an empty function to schedule_on_each_cpu(). This approach
-// provides an asynchronous call_rcu_tasks_rude() API and batching of
-// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
-// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
-// and induces otherwise unnecessary context switches on all online CPUs,
-// whether idle or not.
+// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's
+// trick of passing an empty function to schedule_on_each_cpu().
+// This approach provides batching of concurrent calls to the synchronous
+// synchronize_rcu_tasks_rude() API. This invokes schedule_on_each_cpu()
+// in order to send IPIs far and wide and induces otherwise unnecessary
+// context switches on all online CPUs, whether idle or not.
//
// Callback handling is provided by the rcu_tasks_kthread() function.
//
@@ -1268,11 +1348,11 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
schedule_on_each_cpu(rcu_tasks_be_rude);
}
-void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
"RCU Tasks Rude");
-/**
+/*
 * call_rcu_tasks_rude() - Queue a callback for invocation after a rude task-based grace period
* @rhp: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
@@ -1289,12 +1369,14 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
+ *
+ * This is no longer exported, and is instead reserved for use by
+ * synchronize_rcu_tasks_rude().
*/
-void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
{
call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude);
}
-EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
/**
* synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period
@@ -1320,26 +1402,9 @@ void synchronize_rcu_tasks_rude(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
-/**
- * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks.
- *
- * Although the current implementation is guaranteed to wait, it is not
- * obligated to, for example, if there are no pending callbacks.
- */
-void rcu_barrier_tasks_rude(void)
-{
- rcu_barrier_tasks_generic(&rcu_tasks_rude);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
-
-int rcu_tasks_rude_lazy_ms = -1;
-module_param(rcu_tasks_rude_lazy_ms, int, 0444);
-
static int __init rcu_spawn_tasks_rude_kthread(void)
{
rcu_tasks_rude.gp_sleep = HZ / 10;
- if (rcu_tasks_rude_lazy_ms >= 0)
- rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
}
@@ -1350,6 +1415,12 @@ void show_rcu_tasks_rude_gp_kthread(void)
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
+
+void rcu_tasks_rude_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print);
#endif // !defined(CONFIG_TINY_RCU)
struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
@@ -1613,7 +1684,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
// However, we cannot safely change its state.
n_heavy_reader_attempts++;
// Check for "running" idle tasks on offline CPUs.
- if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
+ if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting))
return -EINVAL; // No quiescent state, do it the hard way.
n_heavy_reader_updates++;
nesting = 0;
@@ -2027,6 +2098,12 @@ void show_rcu_tasks_trace_gp_kthread(void)
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
+
+void rcu_tasks_trace_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print);
#endif // !defined(CONFIG_TINY_RCU)
struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
@@ -2070,17 +2147,13 @@ static struct rcu_tasks_test_desc tests[] = {
.notrun = IS_ENABLED(CONFIG_TASKS_RCU),
},
{
- .name = "call_rcu_tasks_rude()",
- /* If not defined, the test is skipped. */
- .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
- },
- {
.name = "call_rcu_tasks_trace()",
/* If not defined, the test is skipped. */
.notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
}
};
+#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
static void test_rcu_tasks_callback(struct rcu_head *rhp)
{
struct rcu_tasks_test_desc *rttd =
@@ -2090,6 +2163,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
rttd->notrun = false;
}
+#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
static void rcu_tasks_initiate_self_tests(void)
{
@@ -2102,16 +2176,14 @@ static void rcu_tasks_initiate_self_tests(void)
#ifdef CONFIG_TASKS_RUDE_RCU
pr_info("Running RCU Tasks Rude wait API self tests\n");
- tests[1].runstart = jiffies;
synchronize_rcu_tasks_rude();
- call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
pr_info("Running RCU Tasks Trace wait API self tests\n");
- tests[2].runstart = jiffies;
+ tests[1].runstart = jiffies;
synchronize_rcu_tasks_trace();
- call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
+ call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback);
#endif
}
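
A short sketch of the dense-index scheme the tasks.h hunks above adopt: every possible CPU is assigned a sequential index into rtpcp_array at init time, and rcu_tasks_invoke_cbs() then fans callback-invocation work out along a binary tree over those indices (children 2*i+1 and 2*i+2), so sparse CPU numbering no longer punches holes in the fan-out. The function and parameter names below are illustrative; in the patch the actual "kick" is a queue_work_on() guarded by percpu_dequeue_lim.

/* Illustrative sketch only -- not part of the patch above. */
static void demo_fan_out(int index, int nr_entries, void (*kick_one)(int index))
{
	int child = index * 2 + 1;

	if (child < nr_entries) {
		kick_one(child);		/* left child */
		child++;
		if (child < nr_entries)
			kick_one(child);	/* right child */
	}
}
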
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 4402d6f5f857..b3b3ce34df63 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -105,7 +105,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
}
/* Invoke the RCU callbacks whose grace period has elapsed. */
-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
+static __latent_entropy void rcu_process_callbacks(void)
{
struct rcu_head *next, *list;
unsigned long flags;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e641cc681901..e056230551f7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,9 +79,6 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.gpwrap = true,
-#ifdef CONFIG_RCU_NOCB_CPU
- .cblist.flags = SEGCBLIST_RCU_CORE,
-#endif
};
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
@@ -97,6 +94,9 @@ static struct rcu_state rcu_state = {
.srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
rcu_sr_normal_gp_cleanup_work),
.srs_cleanups_pending = ATOMIC_INIT(0),
+#ifdef CONFIG_RCU_NOCB_CPU
+ .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex),
+#endif
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -283,37 +283,45 @@ void rcu_softirq_qs(void)
}
/*
- * Reset the current CPU's ->dynticks counter to indicate that the
+ * Reset the current CPU's RCU_WATCHING counter to indicate that the
* newly onlined CPU is no longer in an extended quiescent state.
* This will either leave the counter unchanged, or increment it
* to the next non-quiescent value.
*
* The non-atomic test/increment sequence works because the upper bits
- * of the ->dynticks counter are manipulated only by the corresponding CPU,
+ * of the ->state variable are manipulated only by the corresponding CPU,
* or when the corresponding CPU is offline.
*/
-static void rcu_dynticks_eqs_online(void)
+static void rcu_watching_online(void)
{
- if (ct_dynticks() & RCU_DYNTICKS_IDX)
+ if (ct_rcu_watching() & CT_RCU_WATCHING)
return;
- ct_state_inc(RCU_DYNTICKS_IDX);
+ ct_state_inc(CT_RCU_WATCHING);
}
/*
- * Return true if the snapshot returned from rcu_dynticks_snap()
+ * Return true if the snapshot returned from ct_rcu_watching()
* indicates that RCU is in an extended quiescent state.
*/
-static bool rcu_dynticks_in_eqs(int snap)
+static bool rcu_watching_snap_in_eqs(int snap)
{
- return !(snap & RCU_DYNTICKS_IDX);
+ return !(snap & CT_RCU_WATCHING);
}
-/*
- * Return true if the CPU corresponding to the specified rcu_data
- * structure has spent some time in an extended quiescent state since
- * rcu_dynticks_snap() returned the specified snapshot.
+/**
+ * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU
+ * since the specified @snap?
+ *
+ * @rdp: The rcu_data corresponding to the CPU for which to check EQS.
+ * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS.
+ *
+ * Returns true if the CPU corresponding to @rdp has spent some time in an
+ * extended quiescent state since @snap. Note that this doesn't check if it
+ * /still/ is in an EQS, just that it went through one since @snap.
+ *
+ * This is meant to be used in a loop waiting for a CPU to go through an EQS.
*/
-static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
+static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap)
{
/*
* The first failing snapshot is already ordered against the accesses
@@ -323,26 +331,29 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
* performed by the remote CPU prior to entering idle and therefore can
* rely solely on acquire semantics.
*/
- return snap != ct_dynticks_cpu_acquire(rdp->cpu);
+ if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap)))
+ return true;
+
+ return snap != ct_rcu_watching_cpu_acquire(rdp->cpu);
}
/*
* Return true if the referenced integer is zero while the specified
* CPU remains within a single extended quiescent state.
*/
-bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
+bool rcu_watching_zero_in_eqs(int cpu, int *vp)
{
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
- smp_rmb(); // Order ->dynticks and *vp reads.
+ snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING;
+ smp_rmb(); // Order CT state and *vp reads.
if (READ_ONCE(*vp))
return false; // Non-zero, so report failure;
- smp_rmb(); // Order *vp read and ->dynticks re-read.
+ smp_rmb(); // Order *vp read and CT state re-read.
// If still in the same extended quiescent state, we are good!
- return snap == ct_dynticks_cpu(cpu);
+ return snap == ct_rcu_watching_cpu(cpu);
}
/*
@@ -356,17 +367,17 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
*
* The caller must have disabled interrupts and must not be idle.
*/
-notrace void rcu_momentary_dyntick_idle(void)
+notrace void rcu_momentary_eqs(void)
{
int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- seq = ct_state_inc(2 * RCU_DYNTICKS_IDX);
+ seq = ct_state_inc(2 * CT_RCU_WATCHING);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
+ WARN_ON_ONCE(!(seq & CT_RCU_WATCHING));
rcu_preempt_deferred_qs(current);
}
-EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
+EXPORT_SYMBOL_GPL(rcu_momentary_eqs);
/**
* rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
@@ -388,13 +399,13 @@ static int rcu_is_cpu_rrupt_from_idle(void)
lockdep_assert_irqs_disabled();
/* Check for counter underflows */
- RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0,
- "RCU dynticks_nesting counter underflow!");
- RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0,
- "RCU dynticks_nmi_nesting counter underflow/zero!");
+ RCU_LOCKDEP_WARN(ct_nesting() < 0,
+ "RCU nesting counter underflow!");
+ RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0,
+ "RCU nmi_nesting counter underflow/zero!");
/* Are we at first interrupt nesting level? */
- nesting = ct_dynticks_nmi_nesting();
+ nesting = ct_nmi_nesting();
if (nesting > 1)
return false;
@@ -404,7 +415,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
WARN_ON_ONCE(!nesting && !is_idle_task(current));
/* Does CPU appear to be idle from an RCU standpoint? */
- return ct_dynticks_nesting() == 0;
+ return ct_nesting() == 0;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -596,12 +607,12 @@ void rcu_irq_exit_check_preempt(void)
{
lockdep_assert_irqs_disabled();
- RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
- "RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
- DYNTICK_IRQ_NONIDLE,
- "Bad RCU dynticks_nmi_nesting counter\n");
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
+ RCU_LOCKDEP_WARN(ct_nesting() <= 0,
+ "RCU nesting counter underflow/zero!");
+ RCU_LOCKDEP_WARN(ct_nmi_nesting() !=
+ CT_NESTING_IRQ_NONIDLE,
+ "Bad RCU nmi_nesting counter\n");
+ RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(),
"RCU in extended quiescent state!");
}
#endif /* #ifdef CONFIG_PROVE_RCU */
@@ -641,7 +652,7 @@ void __rcu_irq_enter_check_tick(void)
if (in_nmi())
return;
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
+ RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(),
"Illegal rcu_irq_enter_check_tick() from extended quiescent state");
if (!tick_nohz_full_cpu(rdp->cpu) ||
@@ -723,7 +734,7 @@ notrace bool rcu_is_watching(void)
bool ret;
preempt_disable_notrace();
- ret = !rcu_dynticks_curr_cpu_in_eqs();
+ ret = rcu_is_watching_curr_cpu();
preempt_enable_notrace();
return ret;
}
@@ -765,11 +776,11 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
}
/*
- * Snapshot the specified CPU's dynticks counter so that we can later
+ * Snapshot the specified CPU's RCU_WATCHING counter so that we can later
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
*/
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
+static int rcu_watching_snap_save(struct rcu_data *rdp)
{
/*
* Full ordering between remote CPU's post idle accesses and updater's
@@ -782,8 +793,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
* Ordering between remote CPU's pre idle accesses and post grace period
* updater's accesses is enforced by the below acquire semantic.
*/
- rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu);
- if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
+ rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu);
+ if (rcu_watching_snap_in_eqs(rdp->watching_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
return 1;
@@ -794,14 +805,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
/*
* Returns positive if the specified CPU has passed through a quiescent state
 * by virtue of being in or having passed through a dynticks idle state since
- * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * the last call to rcu_watching_snap_save() for this same CPU, or by
* virtue of having been offline.
*
* Returns negative if the specified CPU needs a force resched.
*
* Returns zero otherwise.
*/
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+static int rcu_watching_snap_recheck(struct rcu_data *rdp)
{
unsigned long jtsq;
int ret = 0;
@@ -815,7 +826,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) {
+ if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rnp, rdp);
return 1;
@@ -1649,7 +1660,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
* the done tail list manipulations are protected here.
*/
done = smp_load_acquire(&rcu_state.srs_done_tail);
- if (!done)
+ if (WARN_ON_ONCE(!done))
return;
WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
@@ -1984,10 +1995,10 @@ static void rcu_gp_fqs(bool first_time)
if (first_time) {
/* Collect dyntick-idle snapshots. */
- force_qs_rnp(dyntick_save_progress_counter);
+ force_qs_rnp(rcu_watching_snap_save);
} else {
/* Handle dyntick-idle and offline CPUs. */
- force_qs_rnp(rcu_implicit_dynticks_qs);
+ force_qs_rnp(rcu_watching_snap_recheck);
}
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
@@ -2383,7 +2394,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needacc = false;
struct rcu_node *rnp;
WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2420,23 +2430,11 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
* to return true. So complain, but don't awaken.
*/
WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
- } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
- /*
- * ...but NOCB kthreads may miss or delay callbacks acceleration
- * if in the middle of a (de-)offloading process.
- */
- needacc = true;
}
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
-
- if (needacc) {
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_accelerate_cbs_unlocked(rnp, rdp);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
}
}
@@ -2791,24 +2789,6 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- /*
- * On RT rcu_core() can be preempted when IRQs aren't disabled.
- * Therefore this function can race with concurrent NOCB (de-)offloading
- * on this CPU and the below condition must be considered volatile.
- * However if we race with:
- *
- * _ Offloading: In the worst case we accelerate or process callbacks
- * concurrently with NOCB kthreads. We are guaranteed to
- * call rcu_nocb_lock() if that happens.
- *
- * _ Deoffloading: In the worst case we miss callbacks acceleration or
- * processing. This is fine because the early stage
- * of deoffloading invokes rcu_core() after setting
- * SEGCBLIST_RCU_CORE. So we guarantee that we'll process
- * what could have been dismissed without the need to wait
- * for the next rcu_pending() check in the next jiffy.
- */
- const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2828,17 +2808,17 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
- rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
+ local_irq_save(flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ local_irq_restore(flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) &&
likely(READ_ONCE(rcu_scheduler_fully_active))) {
rcu_do_batch(rdp);
/* Re-invoke RCU core processing if there are callbacks remaining. */
@@ -2855,7 +2835,7 @@ static __latent_entropy void rcu_core(void)
queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
}
-static void rcu_core_si(struct softirq_action *h)
+static void rcu_core_si(void)
{
rcu_core();
}
@@ -3227,7 +3207,7 @@ struct kvfree_rcu_bulk_data {
struct list_head list;
struct rcu_gp_oldstate gp_snap;
unsigned long nr_records;
- void *records[];
+ void *records[] __counted_by(nr_records);
};
/*
@@ -3539,10 +3519,10 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
if (delayed_work_pending(&krcp->monitor_work)) {
delay_left = krcp->monitor_work.timer.expires - jiffies;
if (delay < delay_left)
- mod_delayed_work(system_wq, &krcp->monitor_work, delay);
+ mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
return;
}
- queue_delayed_work(system_wq, &krcp->monitor_work, delay);
+ queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
}
static void
@@ -3634,7 +3614,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// be that the work is in the pending state when
// channels have been detached following by each
// other.
- queue_rcu_work(system_wq, &krwp->rcu_work);
+ queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
}
}
@@ -3704,7 +3684,7 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
if (atomic_read(&krcp->backoff_page_cache_fill)) {
- queue_delayed_work(system_wq,
+ queue_delayed_work(system_unbound_wq,
&krcp->page_cache_work,
msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
} else {
@@ -3767,7 +3747,8 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
}
// Finally insert and update the GP for this page.
- bnode->records[bnode->nr_records++] = ptr;
+ bnode->nr_records++;
+ bnode->records[bnode->nr_records - 1] = ptr;
get_state_synchronize_rcu_full(&bnode->gp_snap);
atomic_inc(&(*krcp)->bulk_count[idx]);
@@ -4403,6 +4384,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
{
unsigned long __maybe_unused s = rcu_state.barrier_sequence;
+ rhp->next = rhp; // Mark the callback as having been invoked.
if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
rcu_barrier_trace(TPS("LastCB"), -1, s);
complete(&rcu_state.barrier_completion);
@@ -4804,8 +4786,8 @@ rcu_boot_init_percpu_data(int cpu)
/* Set up local state, ensuring consistent view of global state. */
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
INIT_WORK(&rdp->strict_work, strict_work_handler);
- WARN_ON_ONCE(ct->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)));
+ WARN_ON_ONCE(ct->nesting != 1);
+ WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)));
rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
@@ -4898,7 +4880,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
- ct->dynticks_nesting = 1; /* CPU not up, no tearing. */
+ ct->nesting = 1; /* CPU not up, no tearing. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
@@ -5058,7 +5040,7 @@ void rcutree_report_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
arch_spin_lock(&rcu_state.ofl_lock);
- rcu_dynticks_eqs_online();
+ rcu_watching_online();
raw_spin_lock(&rcu_state.barrier_lock);
raw_spin_lock_rcu_node(rnp);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
@@ -5424,6 +5406,8 @@ static void __init rcu_init_one(void)
while (i > rnp->grphi)
rnp++;
per_cpu_ptr(&rcu_data, i)->mynode = rnp;
+ per_cpu_ptr(&rcu_data, i)->barrier_head.next =
+ &per_cpu_ptr(&rcu_data, i)->barrier_head;
rcu_boot_init_percpu_data(i);
}
}
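
A brief sketch of why the kvfree_rcu hunk above bumps nr_records before storing: once the flexible array is annotated with __counted_by(), fortified/UBSAN bounds checking treats nr_records as the array's current length, so writing records[i] with i >= nr_records can trip a runtime check. The structure and helper below are illustrative stand-ins, not the kvfree_rcu code itself.

/* Illustrative sketch only -- not part of the patch above. */
#include <linux/compiler_attributes.h>	/* __counted_by() */

struct demo_bulk {
	unsigned long nr_records;
	void *records[] __counted_by(nr_records);
};

static bool demo_bulk_add(struct demo_bulk *b, unsigned long capacity, void *ptr)
{
	if (b->nr_records >= capacity)
		return false;
	b->nr_records++;			/* grow the counted length first... */
	b->records[b->nr_records - 1] = ptr;	/* ...then store within it */
	return true;
}
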
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fcf2b4aa3441..a9a811d9d7a3 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -206,7 +206,7 @@ struct rcu_data {
long blimit; /* Upper limit on a processed batch */
/* 3) dynticks interface. */
- int dynticks_snap; /* Per-GP tracking for dynticks. */
+ int watching_snap; /* Per-GP tracking for dynticks. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
@@ -215,7 +215,7 @@ struct rcu_data {
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
struct rcu_head barrier_head;
- int exp_dynticks_snap; /* Double-check need for IPI. */
+ int exp_watching_snap; /* Double-check need for IPI. */
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -411,7 +411,6 @@ struct rcu_state {
arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
/* Synchronize offline with */
/* GP pre-initialization. */
- int nocb_is_setup; /* nocb is setup from boot */
/* synchronize_rcu() part. */
struct llist_head srs_next; /* request a GP users. */
@@ -420,6 +419,11 @@ struct rcu_state {
struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
struct work_struct srs_cleanup_work;
atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+ struct mutex nocb_mutex; /* Guards (de-)offloading */
+ int nocb_is_setup; /* nocb is setup from boot */
+#endif
};
/* Values for rcu_state structure's gp_flags field. */
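
A generic sketch of the snapshot/recheck scheme behind the renamed watching_snap and exp_watching_snap fields: one bit of a per-CPU counter means "RCU is watching", and any change from a non-EQS snapshot means the CPU has been through an extended quiescent state since that snapshot was taken. The counter, bit value, and helpers below are stand-ins, not RCU's own state.

/* Illustrative sketch only -- not part of the patch above. */
#include <linux/atomic.h>

#define DEMO_WATCHING	0x1

static atomic_t demo_ct_state = ATOMIC_INIT(DEMO_WATCHING);

static bool demo_snap_in_eqs(int snap)
{
	return !(snap & DEMO_WATCHING);		/* bit clear: CPU is in an EQS */
}

static bool demo_stopped_since(int snap)
{
	/* Acquire pairs with the CPU's EQS-exit ordering in the real code. */
	return snap != atomic_read_acquire(&demo_ct_state);
}
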
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4acd29d16fdb..fb664d3a01c9 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -7,6 +7,7 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/console.h>
#include <linux/lockdep.h>
static void rcu_exp_handler(void *unused);
@@ -376,11 +377,11 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
* post grace period updater's accesses is enforced by the
* below acquire semantic.
*/
- snap = ct_dynticks_cpu_acquire(cpu);
- if (rcu_dynticks_in_eqs(snap))
+ snap = ct_rcu_watching_cpu_acquire(cpu);
+ if (rcu_watching_snap_in_eqs(snap))
mask_ofl_test |= mask;
else
- rdp->exp_dynticks_snap = snap;
+ rdp->exp_watching_snap = snap;
}
}
mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
@@ -400,7 +401,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
unsigned long mask = rdp->grpmask;
retry_ipi:
- if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
+ if (rcu_watching_snap_stopped_since(rdp, rdp->exp_watching_snap)) {
mask_ofl_test |= mask;
continue;
}
@@ -543,6 +544,67 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit)
}
/*
+ * Print out an expedited RCU CPU stall warning message.
+ */
+static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j)
+{
+ int cpu;
+ unsigned long mask;
+ int ndetected;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root();
+
+ if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+ pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+ return;
+ }
+ pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name);
+ ndetected = 0;
+ rcu_for_each_leaf_node(rnp) {
+ ndetected += rcu_print_task_exp_stall(rnp);
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ struct rcu_data *rdp;
+
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(READ_ONCE(rnp->expmask) & mask))
+ continue;
+ ndetected++;
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ pr_cont(" %d-%c%c%c%c", cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rnp->expmaskinit)],
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+ "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
+ }
+ }
+ pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+ j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask),
+ ".T"[!!data_race(rnp_root->exp_tasks)]);
+ if (ndetected) {
+ pr_err("blocking rcu_node structures (internal RCU debug):");
+ rcu_for_each_node_breadth_first(rnp) {
+ if (rnp == rnp_root)
+ continue; /* printed unconditionally */
+ if (sync_rcu_exp_done_unlocked(rnp))
+ continue;
+ pr_cont(" l=%u:%d-%d:%#lx/%c",
+ rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask),
+ ".T"[!!data_race(rnp->exp_tasks)]);
+ }
+ pr_cont("\n");
+ }
+ rcu_for_each_leaf_node(rnp) {
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(READ_ONCE(rnp->expmask) & mask))
+ continue;
+ dump_cpu_task(cpu);
+ }
+ rcu_exp_print_detail_task_stall_rnp(rnp);
+ }
+}
+
+/*
* Wait for the expedited grace period to elapse, issuing any needed
* RCU CPU stall warnings along the way.
*/
@@ -553,10 +615,8 @@ static void synchronize_rcu_expedited_wait(void)
unsigned long jiffies_stall;
unsigned long jiffies_start;
unsigned long mask;
- int ndetected;
struct rcu_data *rdp;
struct rcu_node *rnp;
- struct rcu_node *rnp_root = rcu_get_root();
unsigned long flags;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
@@ -590,59 +650,17 @@ static void synchronize_rcu_expedited_wait(void)
return;
if (rcu_stall_is_suppressed())
continue;
+
+ nbcon_cpu_emergency_enter();
+
j = jiffies;
rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
- pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
- rcu_state.name);
- ndetected = 0;
- rcu_for_each_leaf_node(rnp) {
- ndetected += rcu_print_task_exp_stall(rnp);
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- struct rcu_data *rdp;
-
- mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(READ_ONCE(rnp->expmask) & mask))
- continue;
- ndetected++;
- rdp = per_cpu_ptr(&rcu_data, cpu);
- pr_cont(" %d-%c%c%c%c", cpu,
- "O."[!!cpu_online(cpu)],
- "o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
- "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
- }
- }
- pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
- j - jiffies_start, rcu_state.expedited_sequence,
- data_race(rnp_root->expmask),
- ".T"[!!data_race(rnp_root->exp_tasks)]);
- if (ndetected) {
- pr_err("blocking rcu_node structures (internal RCU debug):");
- rcu_for_each_node_breadth_first(rnp) {
- if (rnp == rnp_root)
- continue; /* printed unconditionally */
- if (sync_rcu_exp_done_unlocked(rnp))
- continue;
- pr_cont(" l=%u:%d-%d:%#lx/%c",
- rnp->level, rnp->grplo, rnp->grphi,
- data_race(rnp->expmask),
- ".T"[!!data_race(rnp->exp_tasks)]);
- }
- pr_cont("\n");
- }
- rcu_for_each_leaf_node(rnp) {
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(READ_ONCE(rnp->expmask) & mask))
- continue;
- preempt_disable(); // For smp_processor_id() in dump_cpu_task().
- dump_cpu_task(cpu);
- preempt_enable();
- }
- rcu_exp_print_detail_task_stall_rnp(rnp);
- }
+ synchronize_rcu_expedited_stall(jiffies_start, j);
jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+
+ nbcon_cpu_emergency_exit();
+
panic_on_rcu_stall();
}
}
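
The tree_exp.h hunks above bracket the expedited-stall report with nbcon_cpu_emergency_enter()/nbcon_cpu_emergency_exit(), which the new <linux/console.h> include supplies. A hedged sketch of that bracketing pattern follows; the helper name and message are hypothetical, only the enter/exit calls come from the patch.

/* Illustrative sketch only -- not part of the patch above. */
#include <linux/console.h>
#include <linux/printk.h>

static void demo_emergency_report(void)
{
	nbcon_cpu_emergency_enter();
	pr_err("demo: detected a stall, dumping state:\n");
	/* ... the rest of the multi-line report goes here ... */
	nbcon_cpu_emergency_exit();
}
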
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 3ce30841119a..97b99cd06923 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -16,10 +16,6 @@
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return lockdep_is_held(&rdp->nocb_lock);
-}
static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
{
@@ -220,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (needwake) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- wake_up_process(rdp_gp->nocb_gp_kthread);
+ swake_up_one_online(&rdp_gp->nocb_gp_wq);
}
return needwake;
@@ -413,14 +409,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
return false;
}
- // In the process of (de-)offloading: no bypassing, but
- // locking.
- if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false; /* Not offloaded, no bypassing. */
- }
-
// Don't use ->nocb_bypass during early boot.
if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
rcu_nocb_lock(rdp);
@@ -505,7 +493,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
}
rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
+
// A wake up of the grace period kthread or timer adjustment
// needs to be done only if:
// 1. Bypass list was fully empty before (this is the first
@@ -616,37 +604,33 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
}
}
-static int nocb_gp_toggle_rdp(struct rcu_data *rdp)
+static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp)
{
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
- int ret;
- rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
- !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * Locking orders future enqueues of de-offloaded callbacks against the
+ * previous handling of this rdp. That is, make sure rcuog is done with
+ * this rdp before de-offloaded callbacks can be enqueued.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
/*
* Offloading. Set our flag and notify the offload worker.
* We will handle this rdp until it ever gets de-offloaded.
*/
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- ret = 1;
- } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED);
+ } else {
/*
* De-offloading. Clear our flag and notify the de-offload worker.
* We will ignore this rdp until it ever gets re-offloaded.
*/
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- ret = 0;
- } else {
- WARN_ON_ONCE(1);
- ret = -1;
+ list_del(&rdp->nocb_entry_rdp);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED);
}
-
- rcu_nocb_unlock_irqrestore(rdp, flags);
-
- return ret;
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
@@ -853,14 +837,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (rdp_toggling) {
- int ret;
-
- ret = nocb_gp_toggle_rdp(rdp_toggling);
- if (ret == 1)
- list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
- else if (ret == 0)
- list_del(&rdp_toggling->nocb_entry_rdp);
-
+ nocb_gp_toggle_rdp(my_rdp, rdp_toggling);
swake_up_one(&rdp_toggling->nocb_state_wq);
}
@@ -917,7 +894,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
/*
* Disable BH to provide the expected environment. Also, when
@@ -1030,16 +1007,11 @@ void rcu_nocb_flush_deferred_wakeup(void)
}
EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
-static int rdp_offload_toggle(struct rcu_data *rdp,
- bool offload, unsigned long flags)
- __releases(rdp->nocb_lock)
+static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp)
{
- struct rcu_segcblist *cblist = &rdp->cblist;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
bool wake_gp = false;
-
- rcu_segcblist_offload(cblist, offload);
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ unsigned long flags;
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
// Queue this rdp for add/del to/from the list to iterate on rcuog
@@ -1053,88 +1025,73 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
return wake_gp;
}
-static long rcu_nocb_rdp_deoffload(void *arg)
+static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp)
{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
- int wake_gp;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool ret;
/*
- * rcu_nocb_rdp_deoffload() may be called directly if
- * rcuog/o[p] spawn failed, because at this time the rdp->cpu
- * is not online yet.
+ * Locking makes sure rcuog is done handling this rdp before de-offloaded
+ * enqueues can happen. It also keeps the SEGCBLIST_OFFLOADED flag stable
+ * while the ->nocb_lock is held.
*/
- WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ /* CPU must be offline, unless it's early boot */
+ WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id());
pr_info("De-offloading %d\n", rdp->cpu);
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Flush once and for all now. This suffices because we are
- * running on the target CPU holding ->nocb_lock (thus having
- * interrupts disabled), and because rdp_offload_toggle()
- * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
- * Thus future calls to rcu_segcblist_completely_offloaded() will
- * return false, which means that future calls to rcu_nocb_try_bypass()
- * will refuse to put anything into the bypass.
- */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ /* Flush all callbacks from segcblist and bypass */
+ rcu_barrier();
+
/*
- * Start with invoking rcu_core() early. This way if the current thread
- * happens to preempt an ongoing call to rcu_core() in the middle,
- * leaving some work dismissed because rcu_core() still thinks the rdp is
- * completely offloaded, we are guaranteed a nearby future instance of
- * rcu_core() to catch up.
+ * Make sure the rcuoc kthread isn't in the middle of a nocb locked
+ * sequence while offloading is deactivated, along with nocb locking.
*/
- rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
- invoke_rcu_core();
- wake_gp = rdp_offload_toggle(rdp, false, flags);
+ if (rdp->nocb_cb_kthread)
+ kthread_park(rdp->nocb_cb_kthread);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+
if (rdp_gp->nocb_gp_kthread) {
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist,
- SEGCBLIST_KTHREAD_GP));
- if (rdp->nocb_cb_kthread)
- kthread_park(rdp->nocb_cb_kthread);
+ rcu_nocb_rdp_deoffload_wait_cond(rdp));
} else {
/*
* No kthread to clear the flags for us or remove the rdp from the nocb list
* to iterate. Do it here instead. Locking doesn't look strictly necessary
* but we stick to paranoia in this rare path.
*/
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
list_del(&rdp->nocb_entry_rdp);
}
- mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
-
- /*
- * Lock one last time to acquire latest callback updates from kthreads
- * so we can later handle callbacks locally without locking.
- */
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Theoretically we could clear SEGCBLIST_LOCKING after the nocb
- * lock is released but how about being paranoid for once?
- */
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING);
- /*
- * Without SEGCBLIST_LOCKING, we can't use
- * rcu_nocb_unlock_irqrestore() anymore.
- */
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-
- /* Sanity check */
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
return 0;
}
@@ -1145,33 +1102,42 @@ int rcu_nocb_cpu_deoffload(int cpu)
int ret = 0;
cpus_read_lock();
- mutex_lock(&rcu_state.barrier_mutex);
+ mutex_lock(&rcu_state.nocb_mutex);
if (rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ if (!cpu_online(cpu)) {
+ ret = rcu_nocb_rdp_deoffload(rdp);
if (!ret)
cpumask_clear_cpu(cpu, rcu_nocb_mask);
} else {
- pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
+ pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu);
ret = -EINVAL;
}
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
-static long rcu_nocb_rdp_offload(void *arg)
+static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
+{
int wake_gp;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ WARN_ON_ONCE(cpu_online(rdp->cpu));
/*
* For now we only support re-offload, ie: the rdp must have been
* offloaded on boot first.
@@ -1184,44 +1150,17 @@ static long rcu_nocb_rdp_offload(void *arg)
pr_info("Offloading %d\n", rdp->cpu);
- /*
- * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
- * is set.
- */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
- /*
- * We didn't take the nocb lock while working on the
- * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode).
- * Every modifications that have been done previously on
- * rdp->cblist must be visible remotely by the nocb kthreads
- * upon wake up after reading the cblist flags.
- *
- * The layout against nocb_lock enforces that ordering:
- *
- * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
- * ------------------------- ----------------------------
- * WRITE callbacks rcu_nocb_lock()
- * rcu_nocb_lock() READ flags
- * WRITE flags READ callbacks
- * rcu_nocb_unlock() rcu_nocb_unlock()
- */
- wake_gp = rdp_offload_toggle(rdp, true, flags);
+ wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
- kthread_unpark(rdp->nocb_cb_kthread);
-
swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ rcu_nocb_rdp_offload_wait_cond(rdp));
- /*
- * All kthreads are ready to work, we can finally relieve rcu_core() and
- * enable nocb bypass.
- */
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ kthread_unpark(rdp->nocb_cb_kthread);
return 0;
}
@@ -1232,18 +1171,18 @@ int rcu_nocb_cpu_offload(int cpu)
int ret = 0;
cpus_read_lock();
- mutex_lock(&rcu_state.barrier_mutex);
+ mutex_lock(&rcu_state.nocb_mutex);
if (!rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ if (!cpu_online(cpu)) {
+ ret = rcu_nocb_rdp_offload(rdp);
if (!ret)
cpumask_set_cpu(cpu, rcu_nocb_mask);
} else {
- pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
+ pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu);
ret = -EINVAL;
}
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
cpus_read_unlock();
return ret;
@@ -1261,7 +1200,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
return 0;
/* Protect rcu_nocb_mask against concurrent (de-)offloading. */
- if (!mutex_trylock(&rcu_state.barrier_mutex))
+ if (!mutex_trylock(&rcu_state.nocb_mutex))
return 0;
/* Snapshot count of all CPUs */
@@ -1271,7 +1210,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
count += READ_ONCE(rdp->lazy_len);
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
return count ? count : SHRINK_EMPTY;
}
@@ -1289,9 +1228,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
* Protect against concurrent (de-)offloading. Otherwise nocb locking
* may be ignored or imbalanced.
*/
- if (!mutex_trylock(&rcu_state.barrier_mutex)) {
+ if (!mutex_trylock(&rcu_state.nocb_mutex)) {
/*
- * But really don't insist if barrier_mutex is contended since we
+ * But really don't insist if nocb_mutex is contended since we
* can't guarantee that it will never engage in a dependency
* chain involving memory allocation. The lock is seldom contended
* anyway.
@@ -1330,7 +1269,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
break;
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
return count ? count : SHRINK_STOP;
}
@@ -1396,9 +1335,7 @@ void __init rcu_init_nohz(void)
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
- rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
}
rcu_organize_nocb_kthreads();
}
@@ -1446,7 +1383,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
"rcuog/%d", rdp_gp->cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
- goto end;
+ goto err;
}
WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
if (kthread_prio)
@@ -1458,7 +1395,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
t = kthread_create(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- goto end;
+ goto err;
if (rcu_rdp_is_offloaded(rdp))
wake_up_process(t);
@@ -1471,13 +1408,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
WRITE_ONCE(rdp->nocb_cb_kthread, t);
WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
return;
-end:
- mutex_lock(&rcu_state.barrier_mutex);
+
+err:
+ /*
+ * No need to protect against concurrent rcu_barrier()
+ * because the number of callbacks should be 0 for a non-boot CPU,
+ * therefore rcu_barrier() shouldn't even try to grab the nocb_lock.
+ * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker.
+ */
+ WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist));
+ mutex_lock(&rcu_state.nocb_mutex);
if (rcu_rdp_is_offloaded(rdp)) {
rcu_nocb_rdp_deoffload(rdp);
cpumask_clear_cpu(cpu, rcu_nocb_mask);
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
}
/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
@@ -1653,16 +1598,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return 0;
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- return false;
-}
-
/* No ->nocb_lock to acquire. */
static void rcu_nocb_lock(struct rcu_data *rdp)
{
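The reworked (de-)offload path above follows a simple handshake: queue the rdp for the rcuog kthread, wake that kthread, then sleep until the SEGCBLIST_OFFLOADED flag is observed flipped under ->nocb_lock. A rough userspace analogue of that handshake, using pthreads instead of swait and entirely hypothetical names (build with -pthread; this is a model of the pattern, not kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool offloaded = true;	/* stands in for SEGCBLIST_OFFLOADED */
static bool toggle_queued;

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!toggle_queued)
		pthread_cond_wait(&cond, &lock);
	offloaded = false;		/* the worker performs the state flip */
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	pthread_mutex_lock(&lock);
	toggle_queued = true;		/* ~ rcu_nocb_queue_toggle_rdp() */
	pthread_cond_broadcast(&cond);	/* ~ waking the rcuog kthread */
	while (offloaded)		/* ~ swait_event_exclusive(wait_cond) */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("de-offloaded\n");
	return 0;
}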
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c569da65b421..1c7cbd145d5e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -24,10 +24,11 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
* timers have their own means of synchronization against the
* offloaded state updaters.
*/
- RCU_LOCKDEP_WARN(
+ RCU_NOCB_LOCKDEP_WARN(
!(lockdep_is_held(&rcu_state.barrier_mutex) ||
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
- rcu_lockdep_is_held_nocb(rdp) ||
+ lockdep_is_held(&rdp->nocb_lock) ||
+ lockdep_is_held(&rcu_state.nocb_mutex) ||
(!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
rdp == this_cpu_ptr(&rcu_data)) ||
rcu_current_is_nocb_kthread(rdp)),
@@ -869,7 +870,7 @@ static void rcu_qs(void)
/*
* Register an urgently needed quiescent state. If there is an
- * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * emergency, invoke rcu_momentary_eqs() to do a heavy-weight
* dyntick-idle quiescent state visible to other CPUs, which will in
* some cases serve for expedited as well as normal grace periods.
* Either way, register a lightweight quiescent state.
@@ -889,7 +890,7 @@ void rcu_all_qs(void)
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
}
rcu_qs();
@@ -909,7 +910,7 @@ void rcu_note_context_switch(bool preempt)
goto out;
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
out:
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 4b0e9d7c4c68..4432db6d0b99 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,8 +7,10 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/console.h>
#include <linux/kvm_para.h>
#include <linux/rcu_notifier.h>
+#include <linux/smp.h>
//////////////////////////////////////////////////////////////////////////////
//
@@ -370,6 +372,7 @@ static void rcu_dump_cpu_stacks(void)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp) {
+ printk_deferred_enter();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
for_each_leaf_node_possible_cpu(rnp, cpu)
if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
@@ -379,6 +382,7 @@ static void rcu_dump_cpu_stacks(void)
dump_cpu_task(cpu);
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ printk_deferred_exit();
}
}
@@ -501,7 +505,7 @@ static void print_cpu_stall_info(int cpu)
}
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu));
+ rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu));
rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
if (rcuc_starved)
// Print signed value, as negative values indicate a probable bug.
@@ -515,8 +519,8 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- ct_dynticks_cpu(cpu) & 0xffff,
- ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
+ ct_rcu_watching_cpu(cpu) & 0xffff,
+ ct_nesting_cpu(cpu), ct_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
rcuc_starved ? buf : "",
@@ -605,6 +609,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
if (rcu_stall_is_suppressed())
return;
+ nbcon_cpu_emergency_enter();
+
/*
* OK, time to rat on our buddy...
* See Documentation/RCU/stallwarn.rst for info on how to debug
@@ -657,6 +663,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
+ nbcon_cpu_emergency_exit();
+
panic_on_rcu_stall();
rcu_force_quiescent_state(); /* Kick them all. */
@@ -677,6 +685,8 @@ static void print_cpu_stall(unsigned long gps)
if (rcu_stall_is_suppressed())
return;
+ nbcon_cpu_emergency_enter();
+
/*
* OK, time to rat on ourselves...
* See Documentation/RCU/stallwarn.rst for info on how to debug
@@ -706,6 +716,8 @@ static void print_cpu_stall(unsigned long gps)
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ nbcon_cpu_emergency_exit();
+
panic_on_rcu_stall();
/*
@@ -719,6 +731,9 @@ static void print_cpu_stall(unsigned long gps)
set_preempt_need_resched();
}
+static bool csd_lock_suppress_rcu_stall;
+module_param(csd_lock_suppress_rcu_stall, bool, 0644);
+
static void check_cpu_stall(struct rcu_data *rdp)
{
bool self_detected;
@@ -791,7 +806,9 @@ static void check_cpu_stall(struct rcu_data *rdp)
return;
rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
- if (self_detected) {
+ if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+ pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+ } else if (self_detected) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(gps);
} else {
diff --git a/kernel/resource.c b/kernel/resource.c
index 14777afb0a99..a83040fde236 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
if (flags & GFR_DESCENDING) {
resource_size_t end;
- end = min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ end = min_t(resource_size_t, base->end, PHYSMEM_END);
return end - size + 1;
}
@@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
* @size did not wrap 0.
*/
return addr > addr - size &&
- addr <= min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
}
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a9f655025607..1d7f5941bcdc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5762,7 +5762,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
- SCHED_WARN_ON(ct_state() == CONTEXT_USER);
+ SCHED_WARN_ON(ct_state() == CT_STATE_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6658,7 +6658,7 @@ asmlinkage __visible void __sched schedule_user(void)
* we find a better solution.
*
* NB: There are buggy callers of this function. Ideally we
- * should warn if prev_state != CONTEXT_USER, but that will trigger
+ * should warn if prev_state != CT_STATE_USER, but that will trigger
* too frequently to make sense yet.
*/
enum ctx_state prev_state = exception_enter();
@@ -7845,6 +7845,30 @@ void set_rq_offline(struct rq *rq)
}
}
+static inline void sched_set_rq_online(struct rq *rq, int cpu)
+{
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_online(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+static inline void sched_set_rq_offline(struct rq *rq, int cpu)
+{
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/*
* used to mark begin/end of suspend/resume:
*/
@@ -7895,10 +7919,25 @@ static int cpuset_cpu_inactive(unsigned int cpu)
return 0;
}
+static inline void sched_smt_present_inc(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
+#endif
+}
+
+static inline void sched_smt_present_dec(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
+}
+
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
/*
* Clear the balance_push callback and prepare to schedule
@@ -7906,13 +7945,10 @@ int sched_cpu_activate(unsigned int cpu)
*/
balance_push_set(cpu, false);
-#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
- static_branch_inc_cpuslocked(&sched_smt_present);
-#endif
+ sched_smt_present_inc(cpu);
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@@ -7930,12 +7966,7 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
- rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_online(rq);
- }
- rq_unlock_irqrestore(rq, &rf);
+ sched_set_rq_online(rq, cpu);
return 0;
}
@@ -7943,7 +7974,6 @@ int sched_cpu_activate(unsigned int cpu)
int sched_cpu_deactivate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
int ret;
/*
@@ -7974,20 +8004,14 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
- rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- rq_unlock_irqrestore(rq, &rf);
+ sched_set_rq_offline(rq, cpu);
-#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
- static_branch_dec_cpuslocked(&sched_smt_present);
+ sched_smt_present_dec(cpu);
+#ifdef CONFIG_SCHED_SMT
sched_core_cpu_deactivate(cpu);
#endif
@@ -7997,6 +8021,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
+ sched_smt_present_inc(cpu);
+ sched_set_rq_online(rq, cpu);
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
sched_update_numa(cpu, true);
@@ -9726,7 +9752,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
void dump_cpu_task(int cpu)
{
- if (cpu == smp_processor_id() && in_hardirq()) {
+ if (in_hardirq() && cpu == smp_processor_id()) {
struct pt_regs *regs;
regs = get_irq_regs();
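The sched_smt_present_inc()/dec() and sched_set_rq_online()/offline() helpers above exist mainly so the new cpuset failure path in sched_cpu_deactivate() can undo its earlier steps symmetrically. A compilable sketch of that general pattern, with a made-up FEATURE_X switch and refcount standing in for the real static branch:

#include <stdio.h>

#define FEATURE_X 1		/* pretend CONFIG_* switch */

static int feature_refcount = 2;

static inline void feature_inc(int cpu)
{
#if FEATURE_X
	feature_refcount++;	/* #ifdef lives inside the helper... */
#endif
	(void)cpu;
}

static inline void feature_dec(int cpu)
{
#if FEATURE_X
	feature_refcount--;	/* ...so call sites stay unconditional */
#endif
	(void)cpu;
}

static int deactivate(int cpu)
{
	feature_dec(cpu);
	if (cpu == 0) {		/* pretend a later step failed */
		feature_inc(cpu);	/* roll back symmetrically */
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("cpu0: %d, refcount=%d\n", deactivate(0), feature_refcount);
	printf("cpu1: %d, refcount=%d\n", deactivate(1), feature_refcount);
	return 0;
}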
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a5e00293ae43..0bed0fa1acd9 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -582,6 +582,12 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
}
stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
+ /*
+ * Because mul_u64_u64_div_u64() can approximate on some
+ * architectures, enforce the constraint that a*b/(b+c) <= a.
+ */
+ if (unlikely(stime > rtime))
+ stime = rtime;
update:
/*
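The cputime hunk above guards against mul_u64_u64_div_u64() rounding up on architectures that approximate the 128-bit divide: stime is scaled by rtime / (stime + utime), so mathematically it can never exceed rtime, and the clamp restores that invariant when the approximation overshoots. A small standalone illustration with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t utime = 3, rtime = 12;
	uint64_t stime = 7;

	/* the scaling step: stime * rtime / (stime + utime) */
	stime = stime * rtime / (stime + utime);	/* exact: 8 <= rtime */

	/* pretend an approximated divide overshot rtime */
	uint64_t approx_stime = rtime + 1;
	if (approx_stime > rtime)	/* the added clamp: keep stime <= rtime */
		approx_stime = rtime;

	printf("exact=%llu approx(clamped)=%llu rtime=%llu\n",
	       (unsigned long long)stime,
	       (unsigned long long)approx_stime,
	       (unsigned long long)rtime);
	return 0;
}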
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9057584ec06d..8dc9385f6da4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12483,7 +12483,7 @@ out:
* - indirectly from a remote scheduler_tick() for NOHZ idle balancing
* through the SMP cross-call nohz_csd_func()
*/
-static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
+static __latent_entropy void sched_balance_softirq(void)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance;
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 78e48f5426ee..eb0cdcd4d921 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -92,16 +92,6 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
trace_sched_stat_blocked(p, delta);
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(p),
- delta >> 20);
- }
account_scheduler_latency(p, delta >> 10, 0);
}
}
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ae1b42775ef9..195d2f2834a9 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -406,6 +406,14 @@ static void __setscheduler_params(struct task_struct *p,
else if (fair_policy(policy))
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ /* rt-policy tasks do not have a timerslack */
+ if (task_is_realtime(p)) {
+ p->timer_slack_ns = 0;
+ } else if (p->timer_slack_ns == 0) {
+ /* when switching back to non-rt policy, restore timerslack */
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ }
+
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
* !rt_policy. Always setting this ensures that things like
diff --git a/kernel/signal.c b/kernel/signal.c
index 60c737e423a1..6f3a5aa39b09 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -618,20 +618,18 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
}
/*
- * Dequeue a signal and return the element to the caller, which is
- * expected to free it.
- *
- * All callers have to hold the siglock.
+ * Try to dequeue a signal. If a deliverable signal is found fill in the
+ * caller provided siginfo and return the signal number. Otherwise return
+ * 0.
*/
-int dequeue_signal(struct task_struct *tsk, sigset_t *mask,
- kernel_siginfo_t *info, enum pid_type *type)
+int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
{
+ struct task_struct *tsk = current;
bool resched_timer = false;
int signr;
- /* We only dequeue private signals from ourselves, we don't let
- * signalfd steal them
- */
+ lockdep_assert_held(&tsk->sighand->siglock);
+
*type = PIDTYPE_PID;
signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
@@ -1940,10 +1938,11 @@ struct sigqueue *sigqueue_alloc(void)
void sigqueue_free(struct sigqueue *q)
{
- unsigned long flags;
spinlock_t *lock = &current->sighand->siglock;
+ unsigned long flags;
- BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+ if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
+ return;
/*
* We must hold ->siglock while testing q->list
* to serialize with collect_signal() or with
@@ -1971,7 +1970,10 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
unsigned long flags;
int ret, result;
- BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+ if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
+ return 0;
+ if (WARN_ON_ONCE(q->info.si_code != SI_TIMER))
+ return 0;
ret = -1;
rcu_read_lock();
@@ -2006,7 +2008,6 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
* If an SI_TIMER entry is already queued, just increment
* the overrun count.
*/
- BUG_ON(q->info.si_code != SI_TIMER);
q->info.si_overrun++;
result = TRACE_SIGNAL_ALREADY_PENDING;
goto out;
@@ -2793,8 +2794,7 @@ relock:
type = PIDTYPE_PID;
signr = dequeue_synchronous_signal(&ksig->info);
if (!signr)
- signr = dequeue_signal(current, &current->blocked,
- &ksig->info, &type);
+ signr = dequeue_signal(&current->blocked, &ksig->info, &type);
if (!signr)
break; /* will return 0 */
@@ -3648,7 +3648,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
signotset(&mask);
spin_lock_irq(&tsk->sighand->siglock);
- sig = dequeue_signal(tsk, &mask, info, &type);
+ sig = dequeue_signal(&mask, info, &type);
if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
@@ -3667,7 +3667,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
- sig = dequeue_signal(tsk, &mask, info, &type);
+ sig = dequeue_signal(&mask, info, &type);
}
spin_unlock_irq(&tsk->sighand->siglock);
diff --git a/kernel/smp.c b/kernel/smp.c
index aaffecdad319..f25e20617b7e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -208,12 +208,25 @@ static int csd_lock_wait_getcpu(call_single_data_t *csd)
return -1;
}
+static atomic_t n_csd_lock_stuck;
+
+/**
+ * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long?
+ *
+ * Returns @true if a CSD-lock acquisition is stuck and has been stuck
+ * long enough for a "non-responsive CSD lock" message to be printed.
+ */
+bool csd_lock_is_stuck(void)
+{
+ return !!atomic_read(&n_csd_lock_stuck);
+}
+
/*
* Complain if too much time spent waiting. Note that only
* the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
* so waiting on other types gets much less information.
*/
-static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages)
{
int cpu = -1;
int cpux;
@@ -229,15 +242,26 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
cpu = csd_lock_wait_getcpu(csd);
pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
*bug_id, raw_smp_processor_id(), cpu);
+ atomic_dec(&n_csd_lock_stuck);
return true;
}
ts2 = sched_clock();
/* How long since we last checked for a stuck CSD lock.*/
ts_delta = ts2 - *ts1;
- if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
+ if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) *
+ (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) ||
+ csd_lock_timeout_ns == 0))
return false;
+ if (ts0 > ts2) {
+ /* Our own sched_clock went backward; don't blame another CPU. */
+ ts_delta = ts0 - ts2;
+ pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta);
+ *ts1 = ts2;
+ return false;
+ }
+
firsttime = !*bug_id;
if (firsttime)
*bug_id = atomic_inc_return(&csd_bug_count);
@@ -249,9 +273,12 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
/* How long since this CSD lock was stuck. */
ts_delta = ts2 - ts0;
- pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
- firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
+ pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n",
+ firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta,
cpu, csd->func, csd->info);
+ (*nmessages)++;
+ if (firsttime)
+ atomic_inc(&n_csd_lock_stuck);
/*
* If the CSD lock is still stuck after 5 minutes, it is unlikely
* to become unstuck. Use a signed comparison to avoid triggering
@@ -290,12 +317,13 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
*/
static void __csd_lock_wait(call_single_data_t *csd)
{
+ unsigned long nmessages = 0;
int bug_id = 0;
u64 ts0, ts1;
ts1 = ts0 = sched_clock();
for (;;) {
- if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
+ if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages))
break;
cpu_relax();
}
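With the smp.c change above, the first CSD-lock complaint still fires after csd_lock_timeout_ns, but each subsequent complaint requires a progressively longer quiet period, scaled by ilog2() of the online CPU count. A standalone calculation of those gaps, assuming a 5 s timeout and 64 online CPUs (both values are illustrative, not taken from this patch):

#include <stdint.h>
#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	uint64_t csd_lock_timeout_ns = 5ULL * 1000 * 1000 * 1000;	/* assumed */
	unsigned int ncpus = 64;					/* assumed */

	for (unsigned long nmessages = 0; nmessages < 4; nmessages++) {
		/* same expression as the new timeout check above */
		uint64_t wait_ns = csd_lock_timeout_ns * (nmessages + 1) *
				   (!nmessages ? 1 : (ilog2_u(ncpus) / 2 + 1));

		printf("gap before message %lu: > %llu s\n", nmessages + 1,
		       (unsigned long long)(wait_ns / 1000000000ULL));
	}
	return 0;	/* prints gaps of 5, 40, 60 and 80 seconds */
}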
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 02582017759a..d082e7840f88 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -551,7 +551,7 @@ restart:
kstat_incr_softirqs_this_cpu(vec_nr);
trace_softirq_entry(vec_nr);
- h->action(h);
+ h->action();
trace_softirq_exit(vec_nr);
if (unlikely(prev_count != preempt_count())) {
pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
@@ -700,7 +700,7 @@ void __raise_softirq_irqoff(unsigned int nr)
or_softirq_pending(1UL << nr);
}
-void open_softirq(int nr, void (*action)(struct softirq_action *))
+void open_softirq(int nr, void (*action)(void))
{
softirq_vec[nr].action = action;
}
@@ -760,8 +760,7 @@ static bool tasklet_clear_sched(struct tasklet_struct *t)
return false;
}
-static void tasklet_action_common(struct softirq_action *a,
- struct tasklet_head *tl_head,
+static void tasklet_action_common(struct tasklet_head *tl_head,
unsigned int softirq_nr)
{
struct tasklet_struct *list;
@@ -805,16 +804,16 @@ static void tasklet_action_common(struct softirq_action *a,
}
}
-static __latent_entropy void tasklet_action(struct softirq_action *a)
+static __latent_entropy void tasklet_action(void)
{
workqueue_softirq_action(false);
- tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
+ tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
}
-static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
+static __latent_entropy void tasklet_hi_action(void)
{
workqueue_softirq_action(true);
- tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
+ tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
}
void tasklet_setup(struct tasklet_struct *t,
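The softirq hunks above drop the never-used struct softirq_action argument, so handlers are now registered and invoked as plain void (*)(void) functions. A tiny userspace mock of that dispatch shape, with invented names rather than the kernel's vectors:

#include <stdio.h>

#define NR_VECS 2

struct vec {
	void (*action)(void);	/* was: void (*action)(struct softirq_action *) */
};

static struct vec vecs[NR_VECS];

static void open_vec(int nr, void (*action)(void))
{
	vecs[nr].action = action;
}

static void timer_vec(void)   { printf("timer vector ran\n"); }
static void tasklet_vec(void) { printf("tasklet vector ran\n"); }

int main(void)
{
	open_vec(0, timer_vec);
	open_vec(1, tasklet_vec);

	for (int i = 0; i < NR_VECS; i++)
		vecs[i].action();	/* h->action() with no argument */
	return 0;
}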
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index cedb17ba158a..da821ce258ea 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -251,7 +251,7 @@ static int multi_cpu_stop(void *data)
*/
touch_nmi_watchdog();
}
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
} while (curstate != MULTI_STOP_EXIT);
local_irq_restore(flags);
diff --git a/kernel/sys.c b/kernel/sys.c
index 3a2df1bd9f64..e3c4cffb520c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2557,6 +2557,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
+ if (task_is_realtime(current))
+ break;
if (arg2 <= 0)
current->timer_slack_ns =
current->default_timer_slack_ns;
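Together, the syscalls.c and sys.c hunks above make timer slack policy-driven: switching to a realtime policy zeroes timer_slack_ns, PR_SET_TIMERSLACK is ignored while the task stays realtime, and the previous default is restored on the way back. A userspace probe of that behavior (requires privileges for SCHED_FIFO; the numbers printed depend on the running kernel):

#include <sched.h>
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	struct sched_param fifo = { .sched_priority = 1 };
	struct sched_param other = { .sched_priority = 0 };

	printf("default slack: %d ns\n", prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));

	if (sched_setscheduler(0, SCHED_FIFO, &fifo)) {
		perror("sched_setscheduler (needs privileges)");
		return 1;
	}
	prctl(PR_SET_TIMERSLACK, 1000000, 0, 0, 0);	/* ignored while realtime */
	printf("slack as SCHED_FIFO: %d ns\n", prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));

	sched_setscheduler(0, SCHED_OTHER, &other);
	printf("slack back as SCHED_OTHER: %d ns\n",
	       prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));
	return 0;
}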
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5c2daa7ad3f9..5d14d639ac71 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -6,12 +6,14 @@
static struct callback_head work_exited; /* all we need is ->next == NULL */
+#ifdef CONFIG_IRQ_WORK
static void task_work_set_notify_irq(struct irq_work *entry)
{
test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
}
static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
+#endif
/**
* task_work_add - ask the @task to execute @work->func()
@@ -57,6 +59,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
if (notify == TWA_NMI_CURRENT) {
if (WARN_ON_ONCE(task != current))
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_IRQ_WORK))
+ return -EINVAL;
} else {
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack(work);
@@ -81,9 +85,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
case TWA_SIGNAL_NO_IPI:
__set_notify_signal(task);
break;
+#ifdef CONFIG_IRQ_WORK
case TWA_NMI_CURRENT:
irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
break;
+#endif
default:
WARN_ON_ONCE(1);
break;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5abfa4390673..8bf888641694 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -493,7 +493,7 @@ static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throt
* promised in the context of posix_timer_fn() never
* materialized, but someone should really work on it.
*
- * To prevent DOS fake @now to be 1 jiffie out which keeps
+ * To prevent DOS, fake @now to be 1 jiffy out, which keeps
* the overrun accounting correct but creates an
* inconsistency vs. timer_gettime(2).
*/
@@ -574,15 +574,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
it.alarm.alarmtimer);
enum alarmtimer_restart result = ALARMTIMER_NORESTART;
unsigned long flags;
- int si_private = 0;
spin_lock_irqsave(&ptr->it_lock, flags);
- ptr->it_active = 0;
- if (ptr->it_interval)
- si_private = ++ptr->it_requeue_pending;
-
- if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
+ if (posix_timer_queue_signal(ptr) && ptr->it_interval) {
/*
* Handle ignored signals and rearm the timer. This will go
* away once we handle ignored signals proper. Ensure that
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 60a6484831b1..78c7bd64d0dd 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -190,7 +190,7 @@ int clockevents_tick_resume(struct clock_event_device *dev)
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
-/* Limit min_delta to a jiffie */
+/* Limit min_delta to a jiffy */
#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
/**
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d25ba49e313c..23336eecb4f4 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,6 @@ static u64 suspend_start;
/*
* Threshold: 0.0312s, when doubled: 0.0625s.
- * Also a default for cs->uncertainty_margin when registering clocks.
*/
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
@@ -125,6 +124,13 @@ static u64 suspend_start;
*
* The default of 500 parts per million is based on NTP's limits.
* If a clocksource is good enough for NTP, it is good enough for us!
+ *
+ * In other words, by default, even if a clocksource is extremely
+ * precise (for example, with a sub-nanosecond period), the maximum
+ * permissible skew between the clocksource watchdog and the clocksource
+ * under test is not permitted to go below the 500ppm minimum defined
+ * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the
+ * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option.
*/
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
@@ -132,6 +138,13 @@ static u64 suspend_start;
#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
#endif
+/*
+ * Default for maximum permissible skew when cs->uncertainty_margin is
+ * not specified, and the lower bound even when cs->uncertainty_margin
+ * is specified. This is also the default that is used when registering
+ * clocks with unspecified cs->uncertainty_margin, so this macro is used
+ * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
+ */
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
@@ -231,6 +244,7 @@ enum wd_read_status {
static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
{
+ int64_t md = 2 * watchdog->uncertainty_margin;
unsigned int nretries, max_retries;
int64_t wd_delay, wd_seq_delay;
u64 wd_end, wd_end2;
@@ -245,8 +259,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
local_irq_enable();
wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
- if (wd_delay <= WATCHDOG_MAX_SKEW) {
- if (nretries > 1 || nretries >= max_retries) {
+ if (wd_delay <= md + cs->uncertainty_margin) {
+ if (nretries > 1 && nretries >= max_retries) {
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
smp_processor_id(), watchdog->name, nretries);
}
@@ -258,12 +272,12 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
* there is too much external interference that causes
* significant delay in reading both clocksource and watchdog.
*
- * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
- * report system busy, reinit the watchdog and skip the current
+ * If consecutive WD read-back delay > md, report
+ * system busy, reinit the watchdog and skip the current
* watchdog test.
*/
wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
- if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
+ if (wd_seq_delay > md)
goto skip_test;
}
@@ -1146,14 +1160,19 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
}
/*
- * If the uncertainty margin is not specified, calculate it.
- * If both scale and freq are non-zero, calculate the clock
- * period, but bound below at 2*WATCHDOG_MAX_SKEW. However,
- * if either of scale or freq is zero, be very conservative and
- * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
- * uncertainty margin. Allow stupidly small uncertainty margins
- * to be specified by the caller for testing purposes, but warn
- * to discourage production use of this capability.
+ * If the uncertainty margin is not specified, calculate it. If
+ * both scale and freq are non-zero, calculate the clock period, but
+ * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default.
+ * However, if either of scale or freq is zero, be very conservative
+ * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value
+ * for the uncertainty margin. Allow stupidly small uncertainty
+ * margins to be specified by the caller for testing purposes,
+ * but warn to discourage production use of this capability.
+ *
+ * Bottom line: The sum of the uncertainty margins of the
+ * watchdog clocksource and the clocksource under test will be at
+ * least 500ppm by default. For more information, please see the
+ * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above.
*/
if (scale && freq && !cs->uncertainty_margin) {
cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
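For the default-margin calculation described above, one clock period is usually tiny compared with the permitted skew, so the 2*WATCHDOG_MAX_SKEW floor normally wins. A worked sketch with a hypothetical 19.2 MHz counter and a placeholder skew value (WD_MAX_SKEW_NS below is illustrative, not the Kconfig-derived default):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define WD_MAX_SKEW_NS	125000ULL	/* placeholder for WATCHDOG_MAX_SKEW */

int main(void)
{
	uint32_t scale = 1, freq = 19200000;	/* e.g. a 19.2 MHz counter */
	uint64_t margin = NSEC_PER_SEC / ((uint64_t)scale * freq);

	printf("one clock period: %llu ns\n", (unsigned long long)margin);

	/* bound the margin below at twice the maximum permissible skew */
	if (margin < 2 * WD_MAX_SKEW_NS)
		margin = 2 * WD_MAX_SKEW_NS;

	printf("uncertainty margin used: %llu ns\n", (unsigned long long)margin);
	return 0;
}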
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index b8ee320208d4..12eb40d6290e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1177,7 +1177,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
/*
* CONFIG_TIME_LOW_RES indicates that the system has no way to return
* granular time values. For relative timers we add hrtimer_resolution
- * (i.e. one jiffie) to prevent short timeouts.
+ * (i.e. one jiffy) to prevent short timeouts.
*/
timer->is_rel = mode & HRTIMER_MODE_REL;
if (timer->is_rel)
@@ -1351,11 +1351,13 @@ static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
}
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
+ __acquires(&base->softirq_expiry_lock)
{
spin_lock(&base->softirq_expiry_lock);
}
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
+ __releases(&base->softirq_expiry_lock)
{
spin_unlock(&base->softirq_expiry_lock);
}
@@ -1757,7 +1759,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
}
}
-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
+static __latent_entropy void hrtimer_run_softirq(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
unsigned long flags;
@@ -2072,14 +2074,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
- u64 slack;
-
- slack = current->timer_slack_ns;
- if (rt_task(current))
- slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
+ hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
goto out;
@@ -2249,7 +2246,7 @@ void __init hrtimers_init(void)
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
+ * @delta: slack in expires timeout (ktime_t)
* @mode: timer mode
* @clock_id: timer clock to be used
*/
@@ -2276,13 +2273,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
- /*
- * Override any slack passed by the user if under
- * rt contraints.
- */
- if (rt_task(current))
- delta = 0;
-
hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_sleeper_start_expires(&t, mode);
@@ -2302,7 +2292,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
/**
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
+ * @delta: slack in expires timeout (ktime_t)
* @mode: timer mode
*
* Make the current task sleep until the given expiry time has
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 406dccb79c2b..802b336f4b8c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -660,9 +660,17 @@ rearm:
sched_sync_hw_clock(offset_nsec, res != 0);
}
-void ntp_notify_cmos_timer(void)
+void ntp_notify_cmos_timer(bool offset_set)
{
/*
+ * If the time was jumped (using ADJ_SETOFFSET), cancel the sync timer,
+ * which may have been running if the time was synchronized
+ * prior to the ADJ_SETOFFSET call.
+ */
+ if (offset_set)
+ hrtimer_cancel(&sync_hrtimer);
+
+ /*
* When the work is currently executing but has not yet rearmed the
* timer, this queues the work immediately again. No big issue,
* just a pointless work scheduled.
@@ -727,17 +735,16 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
}
if (txc->modes & ADJ_MAXERROR)
- time_maxerror = txc->maxerror;
+ time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT);
if (txc->modes & ADJ_ESTERROR)
- time_esterror = txc->esterror;
+ time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT);
if (txc->modes & ADJ_TIMECONST) {
- time_constant = txc->constant;
+ time_constant = clamp(txc->constant, 0, MAXTC);
if (!(time_status & STA_NANO))
time_constant += 4;
- time_constant = min(time_constant, (long)MAXTC);
- time_constant = max(time_constant, 0l);
+ time_constant = clamp(time_constant, 0, MAXTC);
}
if (txc->modes & ADJ_TAI &&
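The adjtimex hunk above replaces raw assignments and a min()/max() pair with clamp(), bounding maxerror, esterror and the time constant to their valid ranges in one step. A standalone demonstration of the equivalent clamp semantics, using an illustrative limit instead of the real NTP constants:

#include <stdio.h>

#define clamp(val, lo, hi) \
	((val) < (lo) ? (lo) : ((val) > (hi) ? (hi) : (val)))

int main(void)
{
	long maxtc = 10;		/* stand-in for MAXTC */
	long inputs[] = { -3L, 4L, 99L };

	for (int i = 0; i < 3; i++)
		printf("constant %ld -> %ld\n", inputs[i],
		       clamp(inputs[i], 0L, maxtc));
	return 0;	/* prints 0, 4 and 10 */
}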
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 23d1b74c3065..5a633dce9057 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -14,9 +14,9 @@ extern int __do_adjtimex(struct __kernel_timex *txc,
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
-extern void ntp_notify_cmos_timer(void);
+extern void ntp_notify_cmos_timer(bool offset_set);
#else
-static inline void ntp_notify_cmos_timer(void) { }
+static inline void ntp_notify_cmos_timer(bool offset_set) { }
#endif
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e9c6f9d0e42c..6bcee4704059 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -453,6 +453,7 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
struct cpu_timer *ctmr = &timer->it.cpu;
struct posix_cputimer_base *base;
+ timer->it_active = 0;
if (!cpu_timer_dequeue(ctmr))
return;
@@ -559,6 +560,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p)
struct cpu_timer *ctmr = &timer->it.cpu;
u64 newexp = cpu_timer_getexpires(ctmr);
+ timer->it_active = 1;
if (!cpu_timer_enqueue(&base->tqhead, ctmr))
return;
@@ -584,12 +586,8 @@ static void cpu_timer_fire(struct k_itimer *timer)
{
struct cpu_timer *ctmr = &timer->it.cpu;
- if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
- /*
- * User don't want any signal.
- */
- cpu_timer_setexpires(ctmr, 0);
- } else if (unlikely(timer->sigq == NULL)) {
+ timer->it_active = 0;
+ if (unlikely(timer->sigq == NULL)) {
/*
* This is a special case for clock_nanosleep,
* not a normal timer from sys_timer_create.
@@ -600,9 +598,9 @@ static void cpu_timer_fire(struct k_itimer *timer)
/*
* One-shot timer. Clear it as soon as it's fired.
*/
- posix_timer_event(timer, 0);
+ posix_timer_queue_signal(timer);
cpu_timer_setexpires(ctmr, 0);
- } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
+ } else if (posix_timer_queue_signal(timer)) {
/*
* The signal did not get queued because the signal
* was ignored, so we won't get any callback to
@@ -614,6 +612,8 @@ static void cpu_timer_fire(struct k_itimer *timer)
}
}
+static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now);
+
/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
@@ -623,9 +623,10 @@ static void cpu_timer_fire(struct k_itimer *timer)
static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
struct itimerspec64 *new, struct itimerspec64 *old)
{
+ bool sigev_none = timer->it_sigev_notify == SIGEV_NONE;
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
- u64 old_expires, new_expires, old_incr, val;
struct cpu_timer *ctmr = &timer->it.cpu;
+ u64 old_expires, new_expires, now;
struct sighand_struct *sighand;
struct task_struct *p;
unsigned long flags;
@@ -662,10 +663,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
return -ESRCH;
}
- /*
- * Disarm any old timer after extracting its expiry time.
- */
- old_incr = timer->it_interval;
+ /* Retrieve the current expiry time before disarming the timer */
old_expires = cpu_timer_getexpires(ctmr);
if (unlikely(timer->it.cpu.firing)) {
@@ -673,157 +671,122 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
ret = TIMER_RETRY;
} else {
cpu_timer_dequeue(ctmr);
+ timer->it_active = 0;
}
/*
- * We need to sample the current value to convert the new
- * value from to relative and absolute, and to convert the
- * old value from absolute to relative. To set a process
- * timer, we need a sample to balance the thread expiry
- * times (in arm_timer). With an absolute time, we must
- * check if it's already passed. In short, we need a sample.
+ * Sample the current clock for saving the previous setting
+ * and for rearming the timer.
*/
if (CPUCLOCK_PERTHREAD(timer->it_clock))
- val = cpu_clock_sample(clkid, p);
+ now = cpu_clock_sample(clkid, p);
else
- val = cpu_clock_sample_group(clkid, p, true);
+ now = cpu_clock_sample_group(clkid, p, !sigev_none);
+ /* Retrieve the previous expiry value if requested. */
if (old) {
- if (old_expires == 0) {
- old->it_value.tv_sec = 0;
- old->it_value.tv_nsec = 0;
- } else {
- /*
- * Update the timer in case it has overrun already.
- * If it has, we'll report it as having overrun and
- * with the next reloaded timer already ticking,
- * though we are swallowing that pending
- * notification here to install the new setting.
- */
- u64 exp = bump_cpu_timer(timer, val);
-
- if (val < exp) {
- old_expires = exp - val;
- old->it_value = ns_to_timespec64(old_expires);
- } else {
- old->it_value.tv_nsec = 1;
- old->it_value.tv_sec = 0;
- }
- }
+ old->it_value = (struct timespec64){ };
+ if (old_expires)
+ __posix_cpu_timer_get(timer, old, now);
}
+ /* Retry if the timer expiry is running concurrently */
if (unlikely(ret)) {
- /*
- * We are colliding with the timer actually firing.
- * Punt after filling in the timer's old value, and
- * disable this firing since we are already reporting
- * it as an overrun (thanks to bump_cpu_timer above).
- */
unlock_task_sighand(p, &flags);
goto out;
}
- if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
- new_expires += val;
- }
+ /* Convert relative expiry time to absolute */
+ if (new_expires && !(timer_flags & TIMER_ABSTIME))
+ new_expires += now;
+
+ /* Set the new expiry time (might be 0) */
+ cpu_timer_setexpires(ctmr, new_expires);
/*
- * Install the new expiry time (or zero).
- * For a timer with no notification action, we don't actually
- * arm the timer (we'll just fake it for timer_gettime).
+ * Arm the timer if it is not disabled, the new expiry value has
+ * not yet expired and the timer requires signal delivery.
+ * SIGEV_NONE timers are never armed. In case the timer is not
+ * armed, enforce the reevaluation of the timer base so that the
+ * process wide cputime counter can be disabled eventually.
*/
- cpu_timer_setexpires(ctmr, new_expires);
- if (new_expires != 0 && val < new_expires) {
- arm_timer(timer, p);
+ if (likely(!sigev_none)) {
+ if (new_expires && now < new_expires)
+ arm_timer(timer, p);
+ else
+ trigger_base_recalc_expires(timer, p);
}
unlock_task_sighand(p, &flags);
+
+ posix_timer_set_common(timer, new);
+
/*
- * Install the new reload setting, and
- * set up the signal and overrun bookkeeping.
+ * If the new expiry time was already in the past the timer was not
+ * queued. Fire it immediately even if the thread never runs to
+ * accumulate more time on this clock.
*/
- timer->it_interval = timespec64_to_ktime(new->it_interval);
+ if (!sigev_none && new_expires && now >= new_expires)
+ cpu_timer_fire(timer);
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now)
+{
+ bool sigev_none = timer->it_sigev_notify == SIGEV_NONE;
+ u64 expires, iv = timer->it_interval;
/*
- * This acts as a modification timestamp for the timer,
- * so any automatic reload attempt will punt on seeing
- * that we have reset the timer manually.
+ * Make sure that interval timers are moved forward for the
+ * following cases:
+ * - SIGEV_NONE timers which are never armed
+ * - Timers which expired, but the signal has not yet been
+ * delivered
*/
- timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
- ~REQUEUE_PENDING;
- timer->it_overrun_last = 0;
- timer->it_overrun = -1;
-
- if (val >= new_expires) {
- if (new_expires != 0) {
- /*
- * The designated time already passed, so we notify
- * immediately, even if the thread never runs to
- * accumulate more time on this clock.
- */
- cpu_timer_fire(timer);
- }
+ if (iv && ((timer->it_requeue_pending & REQUEUE_PENDING) || sigev_none))
+ expires = bump_cpu_timer(timer, now);
+ else
+ expires = cpu_timer_getexpires(&timer->it.cpu);
+ /*
+ * Expired interval timers cannot have a remaining time <= 0.
+ * The kernel has to move them forward so that the next
+ * timer expiry is > @now.
+ */
+ if (now < expires) {
+ itp->it_value = ns_to_timespec64(expires - now);
+ } else {
/*
- * Make sure we don't keep around the process wide cputime
- * counter or the tick dependency if they are not necessary.
+ * A single shot SIGEV_NONE timer must return 0, when it is
+ * expired! Timers which have a real signal delivery mode
+ * must return a remaining time greater than 0 because the
+ * signal has not yet been delivered.
*/
- sighand = lock_task_sighand(p, &flags);
- if (!sighand)
- goto out;
-
- if (!cpu_timer_queued(ctmr))
- trigger_base_recalc_expires(timer, p);
-
- unlock_task_sighand(p, &flags);
+ if (!sigev_none)
+ itp->it_value.tv_nsec = 1;
}
- out:
- rcu_read_unlock();
- if (old)
- old->it_interval = ns_to_timespec64(old_incr);
-
- return ret;
}
static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
{
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
- struct cpu_timer *ctmr = &timer->it.cpu;
- u64 now, expires = cpu_timer_getexpires(ctmr);
struct task_struct *p;
+ u64 now;
rcu_read_lock();
p = cpu_timer_task_rcu(timer);
- if (!p)
- goto out;
+ if (p && cpu_timer_getexpires(&timer->it.cpu)) {
+ itp->it_interval = ktime_to_timespec64(timer->it_interval);
- /*
- * Easy part: convert the reload time.
- */
- itp->it_interval = ktime_to_timespec64(timer->it_interval);
-
- if (!expires)
- goto out;
-
- /*
- * Sample the clock to take the difference with the expiry time.
- */
- if (CPUCLOCK_PERTHREAD(timer->it_clock))
- now = cpu_clock_sample(clkid, p);
- else
- now = cpu_clock_sample_group(clkid, p, false);
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ now = cpu_clock_sample(clkid, p);
+ else
+ now = cpu_clock_sample_group(clkid, p, false);
- if (now < expires) {
- itp->it_value = ns_to_timespec64(expires - now);
- } else {
- /*
- * The timer should have expired already, but the firing
- * hasn't taken place yet. Say it's just about to expire.
- */
- itp->it_value.tv_nsec = 1;
- itp->it_value.tv_sec = 0;
+ __posix_cpu_timer_get(timer, itp, now);
}
-out:
rcu_read_unlock();
}
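
The SIGEV_NONE rule spelled out in __posix_cpu_timer_get() above can be observed from userspace. A minimal sketch, not part of the patch (the busy-loop bound is arbitrary and assumed to burn well past 50ms of CPU time): an expired one-shot SIGEV_NONE timer on CLOCK_PROCESS_CPUTIME_ID reports a remaining time of 0 from timer_gettime().

/*
 * Userspace sketch: expired one-shot SIGEV_NONE CPU timer reads back 0.
 * Build: gcc -o sigev_none sigev_none.c   (older glibc needs -lrt)
 */
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	/* One-shot timer, 50ms of process CPU time, no interval. */
	struct itimerspec its = { .it_value = { .tv_nsec = 50 * 1000 * 1000 } };
	struct itimerspec cur;
	timer_t tid;

	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid) ||
	    timer_settime(tid, 0, &its, NULL))
		return 1;

	/* Burn CPU time well past the expiry (bound is arbitrary). */
	for (volatile unsigned long i = 0; i < 400000000UL; i++)
		;

	/* Expired + one-shot + SIGEV_NONE => it_value reads back as 0. */
	timer_gettime(tid, &cur);
	printf("remaining: %lld.%09ld\n",
	       (long long)cur.it_value.tv_sec, cur.it_value.tv_nsec);
	timer_delete(tid);
	return 0;
}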
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b924f0f096fa..4576aaed13b2 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -277,10 +277,17 @@ void posixtimer_rearm(struct kernel_siginfo *info)
unlock_timer(timr, flags);
}
-int posix_timer_event(struct k_itimer *timr, int si_private)
+int posix_timer_queue_signal(struct k_itimer *timr)
{
+ int ret, si_private = 0;
enum pid_type type;
- int ret;
+
+ lockdep_assert_held(&timr->it_lock);
+
+ timr->it_active = 0;
+ if (timr->it_interval)
+ si_private = ++timr->it_requeue_pending;
+
/*
* FIXME: if ->sigq is queued we can race with
* dequeue_signal()->posixtimer_rearm().
@@ -309,19 +316,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
*/
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
+ struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer);
enum hrtimer_restart ret = HRTIMER_NORESTART;
- struct k_itimer *timr;
unsigned long flags;
- int si_private = 0;
- timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
- timr->it_active = 0;
- if (timr->it_interval != 0)
- si_private = ++timr->it_requeue_pending;
-
- if (posix_timer_event(timr, si_private)) {
+ if (posix_timer_queue_signal(timr)) {
/*
* The signal was not queued due to SIG_IGN. As a
* consequence the timer is not going to be rearmed from
@@ -338,14 +339,14 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
* change to the signal handling code.
*
* For now let timers with an interval less than a
- * jiffie expire every jiffie and recheck for a
+ * jiffy expire every jiffy and recheck for a
* valid signal handler.
*
* This avoids interrupt starvation in case of a
* very small interval, which would expire the
* timer immediately again.
*
- * Moving now ahead of time by one jiffie tricks
+ * Moving now ahead of time by one jiffy tricks
* hrtimer_forward() to expire the timer later,
* while it still maintains the overrun accuracy
* for the price of a slight inconsistency in the
@@ -515,7 +516,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
spin_lock_irq(&current->sighand->siglock);
/* This makes the timer valid in the hash table */
WRITE_ONCE(new_timer->it_signal, current->signal);
- list_add(&new_timer->list, &current->signal->posix_timers);
+ hlist_add_head(&new_timer->list, &current->signal->posix_timers);
spin_unlock_irq(&current->sighand->siglock);
/*
* After unlocking sighand::siglock @new_timer is subject to
@@ -856,6 +857,23 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
return lock_timer(timer_id, flags);
}
+/*
+ * Set up the new interval and reset the signal delivery data
+ */
+void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting)
+{
+ if (new_setting->it_value.tv_sec || new_setting->it_value.tv_nsec)
+ timer->it_interval = timespec64_to_ktime(new_setting->it_interval);
+ else
+ timer->it_interval = 0;
+
+ /* Prevent reloading in case there is a signal pending */
+ timer->it_requeue_pending = (timer->it_requeue_pending + 2) & ~REQUEUE_PENDING;
+ /* Reset overrun accounting */
+ timer->it_overrun_last = 0;
+ timer->it_overrun = -1LL;
+}
+
/* Set a POSIX.1b interval timer. */
int common_timer_set(struct k_itimer *timr, int flags,
struct itimerspec64 *new_setting,
@@ -878,15 +896,12 @@ int common_timer_set(struct k_itimer *timr, int flags,
return TIMER_RETRY;
timr->it_active = 0;
- timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
- ~REQUEUE_PENDING;
- timr->it_overrun_last = 0;
+ posix_timer_set_common(timr, new_setting);
- /* Switch off the timer when it_value is zero */
+ /* Keep timer disarmed when it_value is zero */
if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
return 0;
- timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
if (flags & TIMER_ABSTIME)
expires = timens_ktime_to_host(timr->it_clock, expires);
@@ -904,7 +919,7 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags,
const struct k_clock *kc;
struct k_itimer *timr;
unsigned long flags;
- int error = 0;
+ int error;
if (!timespec64_valid(&new_spec64->it_interval) ||
!timespec64_valid(&new_spec64->it_value))
@@ -918,6 +933,9 @@ retry:
if (!timr)
return -EINVAL;
+ if (old_spec64)
+ old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
+
kc = timr->kclock;
if (WARN_ON_ONCE(!kc || !kc->timer_set))
error = -EINVAL;
@@ -1021,7 +1039,7 @@ retry_delete:
}
spin_lock(&current->sighand->siglock);
- list_del(&timer->list);
+ hlist_del(&timer->list);
spin_unlock(&current->sighand->siglock);
/*
* A concurrent lookup could check timer::it_signal lockless. It
@@ -1071,7 +1089,7 @@ retry_delete:
goto retry_delete;
}
- list_del(&timer->list);
+ hlist_del(&timer->list);
/*
* Setting timer::it_signal to NULL is technically not required
@@ -1092,22 +1110,19 @@ retry_delete:
*/
void exit_itimers(struct task_struct *tsk)
{
- struct list_head timers;
- struct k_itimer *tmr;
+ struct hlist_head timers;
- if (list_empty(&tsk->signal->posix_timers))
+ if (hlist_empty(&tsk->signal->posix_timers))
return;
/* Protect against concurrent read via /proc/$PID/timers */
spin_lock_irq(&tsk->sighand->siglock);
- list_replace_init(&tsk->signal->posix_timers, &timers);
+ hlist_move_list(&tsk->signal->posix_timers, &timers);
spin_unlock_irq(&tsk->sighand->siglock);
/* The timers are no longer accessible via tsk::signal */
- while (!list_empty(&timers)) {
- tmr = list_first_entry(&timers, struct k_itimer, list);
- itimer_delete(tmr);
- }
+ while (!hlist_empty(&timers))
+ itimer_delete(hlist_entry(timers.first, struct k_itimer, list));
}
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
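
exit_itimers() now uses the hlist variant of the usual "move the whole list to a local head under the lock, then drain it without the lock" pattern. A minimal userspace sketch of that pattern, assuming simplified re-implementations of hlist_add_head(), hlist_move_list() and hlist_entry() rather than the kernel headers:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct hlist_head { struct hlist_node *first; };
struct hlist_node { struct hlist_node *next, **pprev; };

static void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
	n->next = h->first;
	if (h->first)
		h->first->pprev = &n->next;
	h->first = n;
	n->pprev = &h->first;
}

/* Move all entries from @old to @new and leave @old empty. */
static void hlist_move_list(struct hlist_head *old, struct hlist_head *new)
{
	new->first = old->first;
	if (new->first)
		new->first->pprev = &new->first;
	old->first = NULL;
}

#define hlist_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_timer {
	int id;
	struct hlist_node list;
};

int main(void)
{
	struct hlist_head timers = { NULL }, local = { NULL };

	for (int i = 0; i < 3; i++) {
		struct fake_timer *t = malloc(sizeof(*t));
		t->id = i;
		hlist_add_head(&t->list, &timers);
	}

	/* In the kernel this step happens under sighand->siglock. */
	hlist_move_list(&timers, &local);

	/* Drain without the lock, like the while loop in exit_itimers(). */
	while (local.first) {
		struct fake_timer *t = hlist_entry(local.first, struct fake_timer, list);
		local.first = t->list.next;
		printf("deleting timer %d\n", t->id);
		free(t);
	}
	return 0;
}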
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index f32a2ebba9b8..4784ea65f685 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -36,10 +36,11 @@ extern const struct k_clock clock_process;
extern const struct k_clock clock_thread;
extern const struct k_clock alarm_clock;
-int posix_timer_event(struct k_itimer *timr, int si_private);
+int posix_timer_queue_signal(struct k_itimer *timr);
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting);
int common_timer_set(struct k_itimer *timr, int flags,
struct itimerspec64 *new_setting,
struct itimerspec64 *old_setting);
+void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting);
int common_timer_del(struct k_itimer *timer);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b4843099a8da..ed58eebb4e8f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -1141,7 +1141,6 @@ void tick_broadcast_switch_to_oneshot(void)
#ifdef CONFIG_HOTPLUG_CPU
void hotplug_cpu__broadcast_tick_pull(int deadcpu)
{
- struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *bc;
unsigned long flags;
@@ -1167,6 +1166,8 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
* device to avoid the starvation.
*/
if (tick_check_broadcast_expired()) {
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask);
tick_program_event(td->evtdev->next_event, 1);
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2fa87dcfeda9..7e6f409bf311 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2553,6 +2553,7 @@ int do_adjtimex(struct __kernel_timex *txc)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct audit_ntp_data ad;
+ bool offset_set = false;
bool clock_set = false;
struct timespec64 ts;
unsigned long flags;
@@ -2575,6 +2576,7 @@ int do_adjtimex(struct __kernel_timex *txc)
if (ret)
return ret;
+ offset_set = delta.tv_sec != 0;
audit_tk_injoffset(delta);
}
@@ -2606,9 +2608,9 @@ int do_adjtimex(struct __kernel_timex *txc)
clock_set |= timekeeping_advance(TK_ADV_FREQ);
if (clock_set)
- clock_was_set(CLOCK_REALTIME);
+ clock_was_set(CLOCK_SET_WALL);
- ntp_notify_cmos_timer();
+ ntp_notify_cmos_timer(offset_set);
return ret;
}
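
The new offset_set flag above becomes true only when userspace actually injects an offset via ADJ_SETOFFSET. A sketch of such an injection, assuming CAP_SYS_TIME and accepting that it really steps CLOCK_REALTIME by 100ms, so it is for illustration only:

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;
	int ret;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_SETOFFSET | ADJ_NANO;	/* offset given in nanoseconds */
	tx.time.tv_sec = 0;
	tx.time.tv_usec = 100 * 1000 * 1000;	/* +100ms, in ns because of ADJ_NANO */

	ret = adjtimex(&tx);
	if (ret < 0) {
		perror("adjtimex");
		return 1;
	}
	printf("offset injected, clock state: %d\n", ret);
	return 0;
}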
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 64b0d8a0aa0f..0fc9d066a7be 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -365,7 +365,7 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
rem = j % HZ;
/*
- * If the target jiffie is just after a whole second (which can happen
+ * If the target jiffy is just after a whole second (which can happen
* due to delays of the timer irq, long irq off times etc etc) then
* we should round down to the whole second, not up. Use 1/4th second
* as cutoff for this rounding as an extreme upper bound for this.
@@ -672,7 +672,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* Set the next expiry time and kick the CPU so it
* can reevaluate the wheel:
*/
- base->next_expiry = bucket_expiry;
+ WRITE_ONCE(base->next_expiry, bucket_expiry);
base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
@@ -1561,6 +1561,8 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
* the waiter to acquire the lock and make progress.
*/
static void timer_sync_wait_running(struct timer_base *base)
+ __releases(&base->lock) __releases(&base->expiry_lock)
+ __acquires(&base->expiry_lock) __acquires(&base->lock)
{
if (atomic_read(&base->timer_waiters)) {
raw_spin_unlock_irq(&base->lock);
@@ -1898,7 +1900,7 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
*
* Store next expiry time in base->next_expiry.
*/
-static void next_expiry_recalc(struct timer_base *base)
+static void timer_recalc_next_expiry(struct timer_base *base)
{
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
@@ -1928,7 +1930,7 @@ static void next_expiry_recalc(struct timer_base *base)
* bits are zero, we look at the next level as is. If not we
* need to advance it by one because that's going to be the
* next expiring bucket in that level. base->clk is the next
- * expiring jiffie. So in case of:
+ * expiring jiffy. So in case of:
*
* LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
* 0 0 0 0 0 0
@@ -1964,7 +1966,7 @@ static void next_expiry_recalc(struct timer_base *base)
clk += adj;
}
- base->next_expiry = next;
+ WRITE_ONCE(base->next_expiry, next);
base->next_expiry_recalc = false;
base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
}
@@ -1993,7 +1995,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
return basem;
/*
- * Round up to the next jiffie. High resolution timers are
+ * Round up to the next jiffy. High resolution timers are
* off, so the hrtimers are expired in the tick and we need to
* make sure that this tick really expires the timer to avoid
* a ping pong of the nohz stop code.
@@ -2007,7 +2009,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
unsigned long basej)
{
if (base->next_expiry_recalc)
- next_expiry_recalc(base);
+ timer_recalc_next_expiry(base);
/*
* Move next_expiry for the empty base into the future to prevent an
@@ -2018,7 +2020,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
* easy comparable to find out which base holds the first pending timer.
*/
if (!base->timers_pending)
- base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;
+ WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);
return base->next_expiry;
}
@@ -2252,7 +2254,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
base_global, &tevt);
/*
- * If the next event is only one jiffie ahead there is no need to call
+ * If the next event is only one jiffy ahead there is no need to call
* timer migration hierarchy related functions. The value for the next
* global timer in @tevt struct equals then KTIME_MAX. This is also
* true, when the timer base is idle.
@@ -2411,7 +2413,7 @@ static inline void __run_timers(struct timer_base *base)
* jiffies to avoid endless requeuing to current jiffies.
*/
base->clk++;
- next_expiry_recalc(base);
+ timer_recalc_next_expiry(base);
while (levels--)
expire_timers(base, heads + levels);
@@ -2440,7 +2442,7 @@ static void run_timer_base(int index)
/*
* This function runs timers and the timer-tq in bottom half context.
*/
-static __latent_entropy void run_timer_softirq(struct softirq_action *h)
+static __latent_entropy void run_timer_softirq(void)
{
run_timer_base(BASE_LOCAL);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
@@ -2462,8 +2464,40 @@ static void run_local_timers(void)
hrtimer_run_queues();
for (int i = 0; i < NR_BASES; i++, base++) {
- /* Raise the softirq only if required. */
- if (time_after_eq(jiffies, base->next_expiry) ||
+ /*
+ * Raise the softirq only if required.
+ *
+ * timer_base::next_expiry can be written by a remote CPU while
+	 * holding the lock. If this write happens at the same time as
+	 * the lockless local read, the sanity checker could complain
+	 * about data corruption.
+ *
+ * There are two possible situations where
+ * timer_base::next_expiry is written by a remote CPU:
+ *
+ * 1. Remote CPU expires global timers of this CPU and updates
+ * timer_base::next_expiry of BASE_GLOBAL afterwards in
+ * next_timer_interrupt() or timer_recalc_next_expiry(). The
+ * worst outcome is a superfluous raise of the timer softirq
+ * when the not yet updated value is read.
+ *
+	 * 2. A new first pinned timer is enqueued by a remote CPU and
+	 *    therefore timer_base::next_expiry of BASE_LOCAL is updated.
+	 *    Missing this update is not a problem when the CPU was idle
+	 *    before, because an IPI is sent anyway. If the CPU was not
+	 *    idle and the update is missed, the timer simply expires one
+	 *    jiffy late - bad luck.
+ *
+	 * These unlikely corner cases, where the worst outcome is only a
+	 * one jiffy delay or a superfluous raise of the softirq, are far
+	 * cheaper than always performing the check while holding the lock.
+	 *
+	 * Possible remote writers use WRITE_ONCE(); the local reader
+	 * therefore uses READ_ONCE().
+ */
+ if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
(i == BASE_DEF && tmigr_requires_handle_remote())) {
raise_softirq(TIMER_SOFTIRQ);
return;
@@ -2730,7 +2764,7 @@ void __init init_timers(void)
*/
void msleep(unsigned int msecs)
{
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+ unsigned long timeout = msecs_to_jiffies(msecs);
while (timeout)
timeout = schedule_timeout_uninterruptible(timeout);
@@ -2744,7 +2778,7 @@ EXPORT_SYMBOL(msleep);
*/
unsigned long msleep_interruptible(unsigned int msecs)
{
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+ unsigned long timeout = msecs_to_jiffies(msecs);
while (timeout && !signal_pending(current))
timeout = schedule_timeout_interruptible(timeout);
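
The WRITE_ONCE()/READ_ONCE() pairing introduced for timer_base::next_expiry is the usual marked-access pattern for a value written under a lock but read locklessly. A userspace sketch, assuming simplified volatile-based stand-ins for the kernel macros and pthreads in place of CPUs (build with -pthread):

/*
 * The lockless reader uses a marked load so a data-race checker
 * (KCSAN in the kernel) does not flag the concurrent access, and the
 * compiler cannot tear or hoist the load.
 */
#include <pthread.h>
#include <stdio.h>

#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long next_expiry;

/* "Remote CPU": updates next_expiry while holding the base lock. */
static void *writer(void *arg)
{
	for (unsigned long j = 1; j <= 1000; j++) {
		pthread_mutex_lock(&base_lock);
		WRITE_ONCE(next_expiry, j);
		pthread_mutex_unlock(&base_lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	unsigned long seen = 0;

	pthread_create(&tid, NULL, writer, NULL);

	/* "Local CPU": lockless read, tolerating a slightly stale value. */
	while (seen < 1000)
		seen = READ_ONCE(next_expiry);

	pthread_join(tid, NULL);
	printf("last observed next_expiry: %lu\n", seen);
	return 0;
}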
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index fc205ad167a9..d7d4fb403f6f 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -902,7 +902,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
i = *idx ? : task->curr_ret_stack;
while (i > 0) {
- ret_stack = get_ret_stack(current, i, &i);
+ ret_stack = get_ret_stack(task, i, &i);
if (!ret_stack)
break;
/*
@@ -1206,18 +1206,24 @@ static void init_task_vars(int idx)
read_unlock(&tasklist_lock);
}
-static void ftrace_graph_enable_direct(bool enable_branch)
+static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *gops)
{
trace_func_graph_ent_t func = NULL;
trace_func_graph_ret_t retfunc = NULL;
int i;
- for_each_set_bit(i, &fgraph_array_bitmask,
- sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
- func = fgraph_array[i]->entryfunc;
- retfunc = fgraph_array[i]->retfunc;
- fgraph_direct_gops = fgraph_array[i];
- }
+ if (gops) {
+ func = gops->entryfunc;
+ retfunc = gops->retfunc;
+ fgraph_direct_gops = gops;
+ } else {
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ func = fgraph_array[i]->entryfunc;
+ retfunc = fgraph_array[i]->retfunc;
+ fgraph_direct_gops = fgraph_array[i];
+ }
+ }
if (WARN_ON_ONCE(!func))
return;
@@ -1256,8 +1262,6 @@ int register_ftrace_graph(struct fgraph_ops *gops)
ret = -ENOSPC;
goto out;
}
-
- fgraph_array[i] = gops;
gops->idx = i;
ftrace_graph_active++;
@@ -1266,7 +1270,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
ftrace_graph_disable_direct(true);
if (ftrace_graph_active == 1) {
- ftrace_graph_enable_direct(false);
+ ftrace_graph_enable_direct(false, gops);
register_pm_notifier(&ftrace_suspend_notifier);
ret = start_graph_tracing();
if (ret)
@@ -1281,14 +1285,15 @@ int register_ftrace_graph(struct fgraph_ops *gops)
} else {
init_task_vars(gops->idx);
}
-
/* Always save the function, and reset at unregistering */
gops->saved_func = gops->entryfunc;
ret = ftrace_startup_subops(&graph_ops, &gops->ops, command);
+ if (!ret)
+ fgraph_array[i] = gops;
+
error:
if (ret) {
- fgraph_array[i] = &fgraph_stub;
ftrace_graph_active--;
gops->saved_func = NULL;
fgraph_lru_release_index(i);
@@ -1324,7 +1329,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
ftrace_shutdown_subops(&graph_ops, &gops->ops, command);
if (ftrace_graph_active == 1)
- ftrace_graph_enable_direct(true);
+ ftrace_graph_enable_direct(true, NULL);
else if (!ftrace_graph_active)
ftrace_graph_disable_direct(false);
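
The else-branch kept above still scans fgraph_array_bitmask with for_each_set_bit(). A sketch of what such a scan does, assuming a plain unsigned long bitmask and __builtin_ctzl() instead of the kernel helper:

#include <stdio.h>

int main(void)
{
	unsigned long bitmask = (1UL << 0) | (1UL << 3) | (1UL << 7);

	/* Visit each set bit position, lowest first. */
	for (unsigned long m = bitmask; m; m &= m - 1) {
		int i = __builtin_ctzl(m);	/* index of lowest set bit */
		printf("slot %d is active\n", i);
	}
	return 0;
}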
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index cb0871fbdb07..314ffc143039 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
static struct completion done;
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
static void busy_wait(ulong time)
{
u64 start, end;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28853966aa9a..cebd879a30cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -693,18 +693,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
}
/**
- * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
- * @buffer: The ring_buffer to get the number of pages from
- * @cpu: The cpu of the ring_buffer to get the number of pages from
- *
- * Returns the number of pages used by a per_cpu buffer of the ring buffer.
- */
-size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
-{
- return buffer->buffers[cpu]->nr_pages;
-}
-
-/**
* ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 10cd38bce2f1..c3b2c7dfadef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2226,10 +2226,6 @@ static __init int init_trace_selftests(void)
}
core_initcall(init_trace_selftests);
#else
-static inline int run_tracer_selftest(struct tracer *type)
-{
- return 0;
-}
static inline int do_run_tracer_selftest(struct tracer *type)
{
return 0;
@@ -3958,6 +3954,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
break;
entries++;
ring_buffer_iter_advance(buf_iter);
+ /* This could be a big loop */
+ cond_resched();
}
per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
@@ -7956,7 +7954,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
trace_access_unlock(iter->cpu_file);
if (ret < 0) {
- if (trace_empty(iter)) {
+ if (trace_empty(iter) && !iter->closed) {
if ((filp->f_flags & O_NONBLOCK))
return -EAGAIN;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8783bebd0562..bd3e3069300e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1634,6 +1634,29 @@ static inline void *event_file_data(struct file *filp)
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
+/*
+ * When the trace_event_file is the filp->i_private pointer,
+ * it must be read under the event_mutex lock and then checked
+ * to see whether the EVENT_FILE_FL_FREED flag is set. If it is,
+ * the data pointed to by the trace_event_file can not be trusted.
+ *
+ * Use event_file_file() to access the trace_event_file from
+ * the filp the first time under the event_mutex and check for
+ * NULL. If it needs to be retrieved again while the event_mutex
+ * is still held, then event_file_data() can be used and it
+ * is guaranteed to be valid.
+ */
+static inline struct trace_event_file *event_file_file(struct file *filp)
+{
+ struct trace_event_file *file;
+
+ lockdep_assert_held(&event_mutex);
+ file = READ_ONCE(file_inode(filp)->i_private);
+ if (!file || file->flags & EVENT_FILE_FL_FREED)
+ return NULL;
+ return file;
+}
+
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
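
A userspace analogue of the event_file_file() rule documented above, assuming simplified stub types and a pthread mutex in place of event_mutex: the lookup is only meaningful while the mutex is held, and a freed file is treated as -ENODEV.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define EVENT_FILE_FL_FREED	(1 << 0)

struct trace_event_file_stub {
	unsigned int flags;
};

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct trace_event_file_stub *i_private;	/* stands in for filp->i_private */

static struct trace_event_file_stub *event_file_file_stub(void)
{
	struct trace_event_file_stub *file = i_private;

	/* Caller must hold event_mutex; a freed file is reported as absent. */
	if (!file || (file->flags & EVENT_FILE_FL_FREED))
		return NULL;
	return file;
}

static int read_handler(void)
{
	int ret = 0;

	pthread_mutex_lock(&event_mutex);
	if (!event_file_file_stub())
		ret = -ENODEV;
	/* ... otherwise use the file while the mutex is held ... */
	pthread_mutex_unlock(&event_mutex);
	return ret;
}

int main(void)
{
	struct trace_event_file_stub file = { 0 };

	i_private = &file;
	printf("live file:  %d\n", read_handler());
	file.flags |= EVENT_FILE_FL_FREED;
	printf("freed file: %d\n", read_handler());
	return 0;
}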
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6ef29eba90ce..7266ec2a4eea 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -992,18 +992,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
void event_file_get(struct trace_event_file *file)
{
- atomic_inc(&file->ref);
+ refcount_inc(&file->ref);
}
void event_file_put(struct trace_event_file *file)
{
- if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+ if (WARN_ON_ONCE(!refcount_read(&file->ref))) {
if (file->flags & EVENT_FILE_FL_FREED)
kmem_cache_free(file_cachep, file);
return;
}
- if (atomic_dec_and_test(&file->ref)) {
+ if (refcount_dec_and_test(&file->ref)) {
/* Count should only go to zero when it is freed */
if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
return;
@@ -1386,12 +1386,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
char buf[4] = "0";
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (likely(file))
flags = file->flags;
mutex_unlock(&event_mutex);
- if (!file || flags & EVENT_FILE_FL_FREED)
+ if (!file)
return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1424,8 +1424,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
case 1:
ret = -ENODEV;
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
+ file = event_file_file(filp);
+ if (likely(file)) {
ret = tracing_update_buffers(file->tr);
if (ret < 0) {
mutex_unlock(&event_mutex);
@@ -1540,7 +1540,8 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -1572,7 +1573,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -1627,12 +1629,14 @@ static int f_show(struct seq_file *m, void *v)
static void *f_start(struct seq_file *m, loff_t *pos)
{
+ struct trace_event_file *file;
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- if (!event_file_data(m->private))
+ file = event_file_file(m->private);
+ if (!file)
return ERR_PTR(-ENODEV);
while (l < *pos && p)
@@ -1706,8 +1710,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file && !(file->flags & EVENT_FILE_FL_FREED))
+ file = event_file_file(filp);
+ if (file)
print_event_filter(file, s);
mutex_unlock(&event_mutex);
@@ -1736,9 +1740,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
return PTR_ERR(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file)
- err = apply_event_filter(file, buf);
+ file = event_file_file(filp);
+ if (file) {
+ if (file->flags & EVENT_FILE_FL_FREED)
+ err = -ENODEV;
+ else
+ err = apply_event_filter(file, buf);
+ }
mutex_unlock(&event_mutex);
kfree(buf);
@@ -2485,7 +2493,6 @@ static int event_callback(const char *name, umode_t *mode, void **data,
if (strcmp(name, "format") == 0) {
*mode = TRACE_MODE_READ;
*fops = &ftrace_event_format_fops;
- *data = call;
return 1;
}
@@ -2996,7 +3003,7 @@ trace_create_new_event(struct trace_event_call *call,
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
list_add(&file->list, &tr->events);
- event_file_get(file);
+ refcount_set(&file->ref, 1);
return file;
}
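
The hunk above converts the event_file reference count to refcount_t and sets it to 1 at creation. A sketch of the get/put lifetime this implements, assuming C11 atomics without the saturation and warning semantics of the kernel's refcount_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct event_file_stub {
	atomic_int ref;
};

static void event_file_get_stub(struct event_file_stub *file)
{
	atomic_fetch_add(&file->ref, 1);
}

static void event_file_put_stub(struct event_file_stub *file)
{
	/* Only the caller dropping the last reference sees the old value 1. */
	if (atomic_fetch_sub(&file->ref, 1) == 1) {
		printf("last reference dropped, freeing\n");
		free(file);
	}
}

int main(void)
{
	struct event_file_stub *file = malloc(sizeof(*file));

	atomic_init(&file->ref, 1);	/* like refcount_set(&file->ref, 1) at creation */
	event_file_get_stub(file);	/* a second user takes a reference */
	event_file_put_stub(file);	/* second user done, object stays alive */
	event_file_put_stub(file);	/* creator's reference dropped, object freed */
	return 0;
}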
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ece1308d36a..5f9119eb7c67 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5601,7 +5601,7 @@ static int hist_show(struct seq_file *m, void *v)
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file)) {
ret = -ENODEV;
goto out_unlock;
@@ -5880,7 +5880,7 @@ static int hist_debug_show(struct seq_file *m, void *v)
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file)) {
ret = -ENODEV;
goto out_unlock;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 8650562bdaa9..a8f076809db4 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt,
strim(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (file) {
call = file->event_call;
size = parse_entry(buf, call, &entry);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4bec043c8690..a5e3d6acf1e1 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file))
return ERR_PTR(-ENODEV);
@@ -213,7 +213,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
mutex_lock(&event_mutex);
- if (unlikely(!event_file_data(file))) {
+ if (unlikely(!event_file_file(file))) {
mutex_unlock(&event_mutex);
return -ENODEV;
}
@@ -293,7 +293,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
strim(buf);
mutex_lock(&event_mutex);
- event_file = event_file_data(file);
+ event_file = event_file_file(file);
if (unlikely(!event_file)) {
mutex_unlock(&event_mutex);
kfree(buf);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 66a871553d4a..1439064f65d6 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -228,6 +228,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void)
return this_cpu_ptr(&per_cpu_osnoise_var);
}
+/*
+ * Protect the interface.
+ */
+static struct mutex interface_lock;
+
#ifdef CONFIG_TIMERLAT_TRACER
/*
* Runtime information for the timer mode.
@@ -259,14 +264,20 @@ static inline void tlat_var_reset(void)
{
struct timerlat_variables *tlat_var;
int cpu;
+
+ /* Synchronize with the timerlat interfaces */
+ mutex_lock(&interface_lock);
/*
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
*/
for_each_cpu(cpu, cpu_online_mask) {
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+ if (tlat_var->kthread)
+ hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
}
+ mutex_unlock(&interface_lock);
}
#else /* CONFIG_TIMERLAT_TRACER */
#define tlat_var_reset() do {} while (0)
@@ -332,11 +343,6 @@ struct timerlat_sample {
#endif
/*
- * Protect the interface.
- */
-static struct mutex interface_lock;
-
-/*
* Tracer data.
*/
static struct osnoise_data {
@@ -1535,7 +1541,7 @@ static int run_osnoise(void)
* This will eventually cause unwarranted noise as PREEMPT_RCU
* will force preemption as the means of ending the current
* grace period. We avoid this problem by calling
- * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * rcu_momentary_eqs(), which performs a zero duration
* EQS allowing PREEMPT_RCU to end the current grace period.
* This call shouldn't be wrapped inside an RCU critical
* section.
@@ -1547,7 +1553,7 @@ static int run_osnoise(void)
if (!disable_irq)
local_irq_disable();
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
if (!disable_irq)
local_irq_enable();
@@ -1612,6 +1618,7 @@ out:
static struct cpumask osnoise_cpumask;
static struct cpumask save_cpumask;
+static struct cpumask kthread_cpumask;
/*
* osnoise_sleep - sleep until the next period
@@ -1675,6 +1682,7 @@ static inline int osnoise_migration_pending(void)
*/
mutex_lock(&interface_lock);
this_cpu_osn_var()->kthread = NULL;
+ cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask);
mutex_unlock(&interface_lock);
return 1;
@@ -1945,11 +1953,16 @@ static void stop_kthread(unsigned int cpu)
{
struct task_struct *kthread;
+ mutex_lock(&interface_lock);
kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
if (kthread) {
- if (test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ mutex_unlock(&interface_lock);
+
+ if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) &&
+ !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) {
kthread_stop(kthread);
- } else {
+ } else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) {
/*
* This is a user thread waiting on the timerlat_fd. We need
* to close all users, and the best way to guarantee this is
@@ -1958,8 +1971,8 @@ static void stop_kthread(unsigned int cpu)
kill_pid(kthread->thread_pid, SIGKILL, 1);
put_task_struct(kthread);
}
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
} else {
+ mutex_unlock(&interface_lock);
/* if no workload, just return */
if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
/*
@@ -1967,7 +1980,6 @@ static void stop_kthread(unsigned int cpu)
*/
per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
barrier();
- return;
}
}
}
@@ -1982,12 +1994,8 @@ static void stop_per_cpu_kthreads(void)
{
int cpu;
- cpus_read_lock();
-
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
stop_kthread(cpu);
-
- cpus_read_unlock();
}
/*
@@ -2021,6 +2029,7 @@ static int start_kthread(unsigned int cpu)
}
per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+ cpumask_set_cpu(cpu, &kthread_cpumask);
return 0;
}
@@ -2048,8 +2057,16 @@ static int start_per_cpu_kthreads(void)
*/
cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) {
+ struct task_struct *kthread;
+
+ kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
+ if (!WARN_ON(!kthread))
+ kthread_stop(kthread);
+ }
per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ }
for_each_cpu(cpu, current_mask) {
retval = start_kthread(cpu);
@@ -2579,7 +2596,8 @@ static int timerlat_fd_release(struct inode *inode, struct file *file)
osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
- hrtimer_cancel(&tlat_var->timer);
+ if (tlat_var->kthread)
+ hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
osn_var->sampling = 0;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 97f1e4bc47dc..c4ad7cd7e778 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -942,7 +942,7 @@ static __init int test_graph_storage_multi(void)
{
struct fgraph_fixture *fixture;
bool printed = false;
- int i, ret;
+ int i, j, ret;
pr_cont("PASSED\n");
pr_info("Testing multiple fgraph storage on a function: ");
@@ -953,22 +953,35 @@ static __init int test_graph_storage_multi(void)
if (ret && ret != -ENODEV) {
pr_cont("*Could not set filter* ");
printed = true;
- goto out;
+ goto out2;
}
+ }
+ for (j = 0; j < ARRAY_SIZE(store_bytes); j++) {
+ fixture = &store_bytes[j];
ret = register_ftrace_graph(&fixture->gops);
if (ret) {
pr_warn("Failed to init store_bytes fgraph tracing\n");
printed = true;
- goto out;
+ goto out1;
}
}
DYN_FTRACE_TEST_NAME();
-out:
+out1:
+ while (--j >= 0) {
+ fixture = &store_bytes[j];
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str && !printed) {
+ pr_cont("*** %s ***", fixture->error_str);
+ printed = true;
+ }
+ }
+out2:
while (--i >= 0) {
fixture = &store_bytes[i];
- unregister_ftrace_graph(&fixture->gops);
+ ftrace_free_filter(&fixture->gops.ops);
if (fixture->error_str && !printed) {
pr_cont("*** %s ***", fixture->error_str);
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index a4dcf0f24352..3a56e7c8aa4f 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
struct tracing_map_elt *elt = NULL;
int idx;
- idx = atomic_inc_return(&map->next_elt);
+ idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts);
if (idx < map->max_elts) {
elt = *(TRACING_MAP_ELT(map->elts, idx));
if (map->ops && map->ops->elt_init)
@@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map)
{
unsigned int i;
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
atomic64_set(&map->hits, 0);
atomic64_set(&map->drops, 0);
@@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits,
map->map_bits = map_bits;
map->max_elts = (1 << map_bits);
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
map->map_size = (1 << (map_bits + 1));
map->ops = ops;
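
atomic_fetch_add_unless(v, a, u) adds @a to @v unless @v already equals @u and returns the old value, which is why get_free_elt() above can no longer push next_elt past max_elts. A userspace stand-in, assuming a C11 compare-exchange loop rather than the kernel implementation:

#include <stdatomic.h>
#include <stdio.h>

/* Atomically add @a to @v unless @v == @u; return the old value. */
static int fetch_add_unless(atomic_int *v, int a, int u)
{
	int old = atomic_load(v);

	do {
		if (old == u)
			break;
	} while (!atomic_compare_exchange_weak(v, &old, old + a));

	return old;
}

int main(void)
{
	atomic_int next_elt = 0;
	const int max_elts = 2;

	for (int i = 0; i < 4; i++) {
		int idx = fetch_add_unless(&next_elt, 1, max_elts);

		if (idx < max_elts)
			printf("allocated element %d\n", idx);
		else
			printf("map full (idx=%d)\n", idx);
	}
	return 0;
}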
diff --git a/kernel/user.c b/kernel/user.c
index aa1162deafe4..f46b1d41163b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -36,33 +36,33 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc);
*/
struct user_namespace init_user_ns = {
.uid_map = {
- .nr_extents = 1,
{
.extent[0] = {
.first = 0,
.lower_first = 0,
.count = 4294967295U,
},
+ .nr_extents = 1,
},
},
.gid_map = {
- .nr_extents = 1,
{
.extent[0] = {
.first = 0,
.lower_first = 0,
.count = 4294967295U,
},
+ .nr_extents = 1,
},
},
.projid_map = {
- .nr_extents = 1,
{
.extent[0] = {
.first = 0,
.lower_first = 0,
.count = 4294967295U,
},
+ .nr_extents = 1,
},
},
.ns.count = REFCOUNT_INIT(3),
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1745ca788ede..9949ffad8df0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -364,7 +364,8 @@ struct workqueue_struct {
#ifdef CONFIG_LOCKDEP
char *lock_name;
struct lock_class_key key;
- struct lockdep_map lockdep_map;
+ struct lockdep_map __lockdep_map;
+ struct lockdep_map *lockdep_map;
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
@@ -377,7 +378,7 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
- struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+ struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */
struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};
@@ -476,16 +477,13 @@ static bool wq_debug_force_rr_cpu = false;
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
/* to raise softirq for the BH worker pools on other CPUs */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS],
- bh_pool_irq_works);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works);
/* the BH worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
- bh_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools);
/* the per-cpu worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
- cpu_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
@@ -897,7 +895,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
{
- return (v >> shift) & ((1 << bits) - 1);
+ return (v >> shift) & ((1U << bits) - 1);
}
static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
@@ -2709,7 +2707,6 @@ static void detach_worker(struct worker *worker)
unbind_worker(worker);
list_del(&worker->node);
- worker->pool = NULL;
}
/**
@@ -2729,6 +2726,7 @@ static void worker_detach_from_pool(struct worker *worker)
mutex_lock(&wq_pool_attach_mutex);
detach_worker(worker);
+ worker->pool = NULL;
mutex_unlock(&wq_pool_attach_mutex);
/* clear leftover flags without pool->lock after it is detached */
@@ -3203,7 +3201,7 @@ __acquires(&pool->lock)
lockdep_start_depth = lockdep_depth(current);
/* see drain_dead_softirq_workfn() */
if (!bh_draining)
- lock_map_acquire(&pwq->wq->lockdep_map);
+ lock_map_acquire(pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
/*
* Strictly speaking we should mark the invariant state without holding
@@ -3237,7 +3235,7 @@ __acquires(&pool->lock)
pwq->stats[PWQ_STAT_COMPLETED]++;
lock_map_release(&lockdep_map);
if (!bh_draining)
- lock_map_release(&pwq->wq->lockdep_map);
+ lock_map_release(pwq->wq->lockdep_map);
if (unlikely((worker->task && in_atomic()) ||
lockdep_depth(current) != lockdep_start_depth ||
@@ -3349,9 +3347,12 @@ woke_up:
if (unlikely(worker->flags & WORKER_DIE)) {
raw_spin_unlock_irq(&pool->lock);
set_pf_worker(false);
-
+ /*
+		 * The worker is dead and PF_WQ_WORKER is cleared; worker->pool
+		 * must no longer be accessed, so reset it to NULL just in case.
+ */
+ worker->pool = NULL;
ida_free(&pool->worker_ida, worker->id);
- WARN_ON_ONCE(!list_empty(&worker->entry));
return 0;
}
@@ -3870,11 +3871,14 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
static void touch_wq_lockdep_map(struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
+ if (unlikely(!wq->lockdep_map))
+ return;
+
if (wq->flags & WQ_BH)
local_bh_disable();
- lock_map_acquire(&wq->lockdep_map);
- lock_map_release(&wq->lockdep_map);
+ lock_map_acquire(wq->lockdep_map);
+ lock_map_release(wq->lockdep_map);
if (wq->flags & WQ_BH)
local_bh_enable();
@@ -3908,7 +3912,7 @@ void __flush_workqueue(struct workqueue_struct *wq)
struct wq_flusher this_flusher = {
.list = LIST_HEAD_INIT(this_flusher.list),
.flush_color = -1,
- .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
+ .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)),
};
int next_color;
@@ -4167,7 +4171,6 @@ already_gone:
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
struct wq_barrier barr;
- unsigned long data;
if (WARN_ON(!wq_online))
return false;
@@ -4185,29 +4188,35 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
* was queued on a BH workqueue, we also know that it was running in the
* BH context and thus can be busy-waited.
*/
- data = *work_data_bits(work);
- if (from_cancel &&
- !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) {
- /*
- * On RT, prevent a live lock when %current preempted soft
- * interrupt processing or prevents ksoftirqd from running by
- * keeping flipping BH. If the BH work item runs on a different
- * CPU then this has no effect other than doing the BH
- * disable/enable dance for nothing. This is copied from
- * kernel/softirq.c::tasklet_unlock_spin_wait().
- */
- while (!try_wait_for_completion(&barr.done)) {
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- local_bh_disable();
- local_bh_enable();
- } else {
- cpu_relax();
+ if (from_cancel) {
+ unsigned long data = *work_data_bits(work);
+
+ if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) &&
+ (data & WORK_OFFQ_BH)) {
+ /*
+ * On RT, prevent a live lock when %current preempted
+ * soft interrupt processing or prevents ksoftirqd from
+ * running by keeping flipping BH. If the BH work item
+ * runs on a different CPU then this has no effect other
+ * than doing the BH disable/enable dance for nothing.
+ * This is copied from
+ * kernel/softirq.c::tasklet_unlock_spin_wait().
+ */
+ while (!try_wait_for_completion(&barr.done)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ local_bh_disable();
+ local_bh_enable();
+ } else {
+ cpu_relax();
+ }
}
+ goto out_destroy;
}
- } else {
- wait_for_completion(&barr.done);
}
+ wait_for_completion(&barr.done);
+
+out_destroy:
destroy_work_on_stack(&barr.work);
return true;
}
@@ -4768,16 +4777,23 @@ static void wq_init_lockdep(struct workqueue_struct *wq)
lock_name = wq->name;
wq->lock_name = lock_name;
- lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
+ wq->lockdep_map = &wq->__lockdep_map;
+ lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0);
}
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
+ if (wq->lockdep_map != &wq->__lockdep_map)
+ return;
+
lockdep_unregister_key(&wq->key);
}
static void wq_free_lockdep(struct workqueue_struct *wq)
{
+ if (wq->lockdep_map != &wq->__lockdep_map)
+ return;
+
if (wq->lock_name != wq->name)
kfree(wq->lock_name);
}
@@ -5611,12 +5627,10 @@ static void wq_adjust_max_active(struct workqueue_struct *wq)
} while (activated);
}
-__printf(1, 4)
-struct workqueue_struct *alloc_workqueue(const char *fmt,
- unsigned int flags,
- int max_active, ...)
+static struct workqueue_struct *__alloc_workqueue(const char *fmt,
+ unsigned int flags,
+ int max_active, va_list args)
{
- va_list args;
struct workqueue_struct *wq;
size_t wq_size;
int name_len;
@@ -5648,9 +5662,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
goto err_free_wq;
}
- va_start(args, max_active);
name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
- va_end(args);
if (name_len >= WQ_NAME_LEN)
pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
@@ -5680,12 +5692,11 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
INIT_LIST_HEAD(&wq->flusher_overflow);
INIT_LIST_HEAD(&wq->maydays);
- wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
if (flags & WQ_UNBOUND) {
if (alloc_node_nr_active(wq->node_nr_active) < 0)
- goto err_unreg_lockdep;
+ goto err_free_wq;
}
/*
@@ -5724,9 +5735,6 @@ err_unlock_free_node_nr_active:
kthread_flush_worker(pwq_release_worker);
free_node_nr_active(wq->node_nr_active);
}
-err_unreg_lockdep:
- wq_unregister_lockdep(wq);
- wq_free_lockdep(wq);
err_free_wq:
free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
@@ -5737,8 +5745,49 @@ err_destroy:
destroy_workqueue(wq);
return NULL;
}
+
+__printf(1, 4)
+struct workqueue_struct *alloc_workqueue(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
+{
+ struct workqueue_struct *wq;
+ va_list args;
+
+ va_start(args, max_active);
+ wq = __alloc_workqueue(fmt, flags, max_active, args);
+ va_end(args);
+ if (!wq)
+ return NULL;
+
+ wq_init_lockdep(wq);
+
+ return wq;
+}
EXPORT_SYMBOL_GPL(alloc_workqueue);
+#ifdef CONFIG_LOCKDEP
+__printf(1, 5)
+struct workqueue_struct *
+alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags,
+ int max_active, struct lockdep_map *lockdep_map, ...)
+{
+ struct workqueue_struct *wq;
+ va_list args;
+
+ va_start(args, lockdep_map);
+ wq = __alloc_workqueue(fmt, flags, max_active, args);
+ va_end(args);
+ if (!wq)
+ return NULL;
+
+ wq->lockdep_map = lockdep_map;
+
+ return wq;
+}
+EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map);
+#endif
+
static bool pwq_busy(struct pool_workqueue *pwq)
{
int i;
@@ -7398,6 +7447,9 @@ static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+static unsigned int wq_panic_on_stall;
+module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
+
/*
* Show workers that might prevent the processing of pending work items.
* The only candidates are CPU-bound workers in the running state.
@@ -7449,6 +7501,16 @@ static void show_cpu_pools_hogs(void)
rcu_read_unlock();
}
+static void panic_on_wq_watchdog(void)
+{
+ static unsigned int wq_stall;
+
+ if (wq_panic_on_stall) {
+ wq_stall++;
+ BUG_ON(wq_stall >= wq_panic_on_stall);
+ }
+}
+
static void wq_watchdog_reset_touched(void)
{
int cpu;
@@ -7521,6 +7583,9 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
if (cpu_pool_stall)
show_cpu_pools_hogs();
+ if (lockup_detected)
+ panic_on_wq_watchdog();
+
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);
}