summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig.binfmt6
-rw-r--r--fs/afs/cmservice.c2
-rw-r--r--fs/afs/dir.c108
-rw-r--r--fs/afs/dir_silly.c22
-rw-r--r--fs/afs/fs_probe.c23
-rw-r--r--fs/afs/fsclient.c35
-rw-r--r--fs/afs/internal.h4
-rw-r--r--fs/afs/rotate.c6
-rw-r--r--fs/afs/server.c7
-rw-r--r--fs/afs/vl_probe.c18
-rw-r--r--fs/afs/vl_rotate.c4
-rw-r--r--fs/afs/volume.c8
-rw-r--r--fs/afs/yfsclient.c34
-rw-r--r--fs/binfmt_elf.c147
-rw-r--r--fs/block_dev.c11
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/block-group.c21
-rw-r--r--fs/btrfs/discard.h2
-rw-r--r--fs/btrfs/disk-io.c36
-rw-r--r--fs/btrfs/file.c15
-rw-r--r--fs/btrfs/reflink.c1
-rw-r--r--fs/btrfs/relocation.c24
-rw-r--r--fs/btrfs/space-info.c20
-rw-r--r--fs/btrfs/transaction.c13
-rw-r--r--fs/btrfs/tree-log.c136
-rw-r--r--fs/buffer.c13
-rw-r--r--fs/cachefiles/rdwr.c12
-rw-r--r--fs/ceph/caps.c5
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/dir.c4
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/mds_client.c8
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/quota.c4
-rw-r--r--fs/cifs/cifsglob.h3
-rw-r--r--fs/cifs/cifssmb.c6
-rw-r--r--fs/cifs/connect.c6
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/misc.c82
-rw-r--r--fs/cifs/smb2ops.c5
-rw-r--r--fs/cifs/smb2pdu.c15
-rw-r--r--fs/cifs/smb2transport.c4
-rw-r--r--fs/compat_binfmt_elf.c4
-rw-r--r--fs/configfs/dir.c1
-rw-r--r--fs/coredump.c10
-rw-r--r--fs/crypto/crypto.c15
-rw-r--r--fs/crypto/fname.c59
-rw-r--r--fs/crypto/fscrypt_private.h111
-rw-r--r--fs/crypto/hkdf.c6
-rw-r--r--fs/crypto/hooks.c4
-rw-r--r--fs/crypto/keyring.c122
-rw-r--r--fs/crypto/keysetup.c109
-rw-r--r--fs/crypto/policy.c195
-rw-r--r--fs/debugfs/file.c15
-rw-r--r--fs/ecryptfs/crypto.c17
-rw-r--r--fs/eventpoll.c107
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exfat/balloc.c3
-rw-r--r--fs/exfat/exfat_fs.h1
-rw-r--r--fs/exfat/file.c15
-rw-r--r--fs/exfat/misc.c14
-rw-r--r--fs/exfat/namei.c8
-rw-r--r--fs/exfat/super.c72
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/ext4.h9
-rw-r--r--fs/ext4/ext4_jbd2.c3
-rw-r--r--fs/ext4/extents.c39
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/ioctl.c33
-rw-r--r--fs/ext4/mballoc.c6
-rw-r--r--fs/ext4/super.c72
-rw-r--r--fs/ext4/sysfs.c2
-rw-r--r--fs/f2fs/f2fs.h4
-rw-r--r--fs/f2fs/hash.c1
-rw-r--r--fs/f2fs/super.c85
-rw-r--r--fs/f2fs/sysfs.c4
-rw-r--r--fs/file.c2
-rw-r--r--fs/gfs2/bmap.c16
-rw-r--r--fs/gfs2/glock.c6
-rw-r--r--fs/gfs2/inode.c7
-rw-r--r--fs/gfs2/log.c11
-rw-r--r--fs/gfs2/lops.c26
-rw-r--r--fs/gfs2/meta_io.c2
-rw-r--r--fs/gfs2/quota.c13
-rw-r--r--fs/gfs2/quota.h3
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/gfs2/util.c10
-rw-r--r--fs/io_uring.c485
-rw-r--r--fs/ioctl.c8
-rw-r--r--fs/iomap/fiemap.c5
-rw-r--r--fs/nfs/fscache.c39
-rw-r--r--fs/nfs/mount_clnt.c3
-rw-r--r--fs/nfs/nfs3acl.c22
-rw-r--r--fs/nfs/nfs4proc.c13
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/pagelist.c5
-rw-r--r--fs/nfs/pnfs.c14
-rw-r--r--fs/nfs/pnfs_nfs.c3
-rw-r--r--fs/nfs/super.c3
-rw-r--r--fs/nfs/write.c4
-rw-r--r--fs/nfsd/nfs4callback.c4
-rw-r--r--fs/nfsd/nfs4recover.c26
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/notify/fanotify/fanotify.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c27
-rw-r--r--fs/overlayfs/export.c3
-rw-r--r--fs/overlayfs/inode.c18
-rw-r--r--fs/pnode.c9
-rw-r--r--fs/proc/base.c15
-rw-r--r--fs/proc/meminfo.c4
-rw-r--r--fs/proc/root.c7
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/proc/vmcore.c5
-rw-r--r--fs/pstore/Kconfig109
-rw-r--r--fs/pstore/Makefile6
-rw-r--r--fs/pstore/blk.c517
-rw-r--r--fs/pstore/ftrace.c54
-rw-r--r--fs/pstore/inode.c129
-rw-r--r--fs/pstore/internal.h11
-rw-r--r--fs/pstore/platform.c117
-rw-r--r--fs/pstore/ram.c155
-rw-r--r--fs/pstore/zone.c1465
-rw-r--r--fs/splice.c47
-rw-r--r--fs/squashfs/decompressor_multi_percpu.c21
-rw-r--r--fs/super.c2
-rw-r--r--fs/ubifs/auth.c37
-rw-r--r--fs/ubifs/file.c6
-rw-r--r--fs/ubifs/master.c9
-rw-r--r--fs/ubifs/replay.c27
-rw-r--r--fs/vboxsf/super.c2
-rw-r--r--fs/verity/enable.c2
-rw-r--r--fs/verity/fsverity_private.h4
-rw-r--r--fs/verity/measure.c2
-rw-r--r--fs/verity/open.c1
-rw-r--r--fs/verity/signature.c3
-rw-r--r--fs/verity/verify.c3
-rw-r--r--fs/xattr.c6
-rw-r--r--fs/xfs/xfs_icache.c10
-rw-r--r--fs/xfs/xfs_ioctl.c5
-rw-r--r--fs/xfs/xfs_mount.h6
-rw-r--r--fs/xfs/xfs_reflink.c1
-rw-r--r--fs/xfs/xfs_super.c40
144 files changed, 4313 insertions, 1363 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 3fbbd54f50fd..04f86b8c100e 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -36,6 +36,12 @@ config COMPAT_BINFMT_ELF
config ARCH_BINFMT_ELF_STATE
bool
+config ARCH_HAVE_ELF_PROT
+ bool
+
+config ARCH_USE_GNU_PROPERTY
+ bool
+
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 6765949b3aab..380ad5ace7cf 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -169,7 +169,7 @@ static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server)
spin_lock(&server->probe_lock);
- if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
+ if (!test_and_set_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
server->cm_epoch = call->epoch;
server->probe.cm_epoch = call->epoch;
goto out;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5c794f4b051a..d1e1caa23c8b 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1032,7 +1032,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *inode;
struct key *key;
- afs_dataversion_t dir_version;
+ afs_dataversion_t dir_version, invalid_before;
long de_version;
int ret;
@@ -1084,8 +1084,8 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (de_version == (long)dir_version)
goto out_valid_noupdate;
- dir_version = dir->invalid_before;
- if (de_version - (long)dir_version >= 0)
+ invalid_before = dir->invalid_before;
+ if (de_version - (long)invalid_before >= 0)
goto out_valid;
_debug("dir modified");
@@ -1275,6 +1275,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct afs_fs_cursor fc;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
mode |= S_IFDIR;
@@ -1295,7 +1296,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1316,10 +1317,14 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto error_key;
}
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
- afs_edit_dir_for_create);
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
+ afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
+ afs_edit_dir_for_create);
+ up_write(&dvnode->validate_lock);
+ }
key_put(key);
kfree(scb);
@@ -1360,6 +1365,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
struct afs_fs_cursor fc;
struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%pd}",
@@ -1391,7 +1397,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1404,9 +1410,12 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
ret = afs_end_vnode_operation(&fc);
if (ret == 0) {
afs_dir_remove_subdir(dentry);
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_remove(dvnode, &dentry->d_name,
afs_edit_dir_for_rmdir);
+ up_write(&dvnode->validate_lock);
}
}
@@ -1544,10 +1553,15 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
ret = afs_end_vnode_operation(&fc);
if (ret == 0 && !(scb[1].have_status || scb[1].have_error))
ret = afs_dir_remove_link(dvnode, dentry, key);
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_remove(dvnode, &dentry->d_name,
- afs_edit_dir_for_unlink);
+
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
+ afs_edit_dir_remove(dvnode, &dentry->d_name,
+ afs_edit_dir_for_unlink);
+ up_write(&dvnode->validate_lock);
+ }
}
if (need_rehash && ret < 0 && ret != -ENOENT)
@@ -1573,6 +1587,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct afs_status_cb *scb;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
mode |= S_IFREG;
@@ -1597,7 +1612,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1618,9 +1633,12 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
afs_edit_dir_for_create);
+ up_write(&dvnode->validate_lock);
kfree(scb);
key_put(key);
@@ -1648,6 +1666,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct afs_vnode *vnode = AFS_FS_I(d_inode(from));
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%llx:%llu},{%pd}",
@@ -1672,7 +1691,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) {
afs_end_vnode_operation(&fc);
@@ -1702,9 +1721,12 @@ static int afs_link(struct dentry *from, struct inode *dir,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid,
afs_edit_dir_for_link);
+ up_write(&dvnode->validate_lock);
key_put(key);
kfree(scb);
@@ -1732,6 +1754,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
struct afs_status_cb *scb;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%pd},%s",
@@ -1759,7 +1782,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1780,9 +1803,12 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
afs_edit_dir_for_symlink);
+ up_write(&dvnode->validate_lock);
key_put(key);
kfree(scb);
@@ -1812,6 +1838,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *tmp = NULL, *rehash = NULL;
struct inode *new_inode;
struct key *key;
+ afs_dataversion_t orig_data_version;
+ afs_dataversion_t new_data_version;
bool new_negative = d_is_negative(new_dentry);
int ret;
@@ -1890,10 +1918,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, orig_dvnode, key, true)) {
- afs_dataversion_t orig_data_version;
- afs_dataversion_t new_data_version;
- struct afs_status_cb *new_scb = &scb[1];
-
orig_data_version = orig_dvnode->status.data_version + 1;
if (orig_dvnode != new_dvnode) {
@@ -1904,7 +1928,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_data_version = new_dvnode->status.data_version + 1;
} else {
new_data_version = orig_data_version;
- new_scb = &scb[0];
}
while (afs_select_fileserver(&fc)) {
@@ -1912,7 +1935,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
afs_fs_rename(&fc, old_dentry->d_name.name,
new_dvnode, new_dentry->d_name.name,
- &scb[0], new_scb);
+ &scb[0], &scb[1]);
}
afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break,
@@ -1930,18 +1953,25 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret == 0) {
if (rehash)
d_rehash(rehash);
- if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags))
- afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
- afs_edit_dir_for_rename_0);
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_data_version)
+ afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
+ afs_edit_dir_for_rename_0);
+ if (orig_dvnode != new_dvnode) {
+ up_write(&orig_dvnode->validate_lock);
- if (!new_negative &&
- test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
- afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
- afs_edit_dir_for_rename_1);
+ down_write(&new_dvnode->validate_lock);
+ }
+ if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+ orig_dvnode->status.data_version == new_data_version) {
+ if (!new_negative)
+ afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
+ afs_edit_dir_for_rename_1);
- if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
afs_edit_dir_add(new_dvnode, &new_dentry->d_name,
&vnode->fid, afs_edit_dir_for_rename_2);
+ }
new_inode = d_inode(new_dentry);
if (new_inode) {
@@ -1957,14 +1987,10 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
* Note that if we ever implement RENAME_EXCHANGE, we'll have
* to update both dentries with opposing dir versions.
*/
- if (new_dvnode != orig_dvnode) {
- afs_update_dentry_version(&fc, old_dentry, &scb[1]);
- afs_update_dentry_version(&fc, new_dentry, &scb[1]);
- } else {
- afs_update_dentry_version(&fc, old_dentry, &scb[0]);
- afs_update_dentry_version(&fc, new_dentry, &scb[0]);
- }
+ afs_update_dentry_version(&fc, old_dentry, &scb[1]);
+ afs_update_dentry_version(&fc, new_dentry, &scb[1]);
d_move(old_dentry, new_dentry);
+ up_write(&new_dvnode->validate_lock);
goto error_tmp;
}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 361088a5edb9..d94e2b7cddff 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -21,6 +21,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
{
struct afs_fs_cursor fc;
struct afs_status_cb *scb;
+ afs_dataversion_t dir_data_version;
int ret = -ERESTARTSYS;
_enter("%pd,%pd", old, new);
@@ -31,7 +32,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
trace_afs_silly_rename(vnode, false);
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t dir_data_version = dvnode->status.data_version + 1;
+ dir_data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -54,12 +55,15 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
dvnode->silly_key = key_get(key);
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == dir_data_version) {
afs_edit_dir_remove(dvnode, &old->d_name,
afs_edit_dir_for_silly_0);
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_edit_dir_add(dvnode, &new->d_name,
&vnode->fid, afs_edit_dir_for_silly_1);
+ }
+ up_write(&dvnode->validate_lock);
}
kfree(scb);
@@ -181,10 +185,14 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
}
}
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_remove(dvnode, &dentry->d_name,
- afs_edit_dir_for_unlink);
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == dir_data_version)
+ afs_edit_dir_remove(dvnode, &dentry->d_name,
+ afs_edit_dir_for_unlink);
+ up_write(&dvnode->validate_lock);
+ }
}
kfree(scb);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index e1b9ed679045..37d1bba57b00 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -32,9 +32,8 @@ void afs_fileserver_probe_result(struct afs_call *call)
struct afs_server *server = call->server;
unsigned int server_index = call->server_index;
unsigned int index = call->addr_ix;
- unsigned int rtt = UINT_MAX;
+ unsigned int rtt_us = 0;
bool have_result = false;
- u64 _rtt;
int ret = call->error;
_enter("%pU,%u", &server->uuid, index);
@@ -93,15 +92,9 @@ responded:
}
}
- /* Get the RTT and scale it to fit into a 32-bit value that represents
- * over a minute of time so that we can access it with one instruction
- * on a 32-bit system.
- */
- _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
- _rtt /= 64;
- rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
- if (rtt < server->probe.rtt) {
- server->probe.rtt = rtt;
+ rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
+ if (rtt_us < server->probe.rtt) {
+ server->probe.rtt = rtt_us;
alist->preferred = index;
have_result = true;
}
@@ -113,15 +106,11 @@ out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
- server_index, index, &alist->addrs[index].transport,
- (unsigned int)rtt, ret);
+ server_index, index, &alist->addrs[index].transport, rtt_us, ret);
have_result |= afs_fs_probe_done(server);
- if (have_result) {
- server->probe.have_result = true;
- wake_up_var(&server->probe.have_result);
+ if (have_result)
wake_up_all(&server->probe_wq);
- }
}
/*
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1f9c5d8e6fe5..d2b3798c1932 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -65,6 +65,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus);
u64 data_version, size;
u32 type, abort_code;
+ int ret;
abort_code = ntohl(xdr->abort_code);
@@ -78,7 +79,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
*/
status->abort_code = abort_code;
scb->have_error = true;
- return 0;
+ goto good;
}
pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version));
@@ -87,7 +88,8 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
if (abort_code != 0 && inline_error) {
status->abort_code = abort_code;
- return 0;
+ scb->have_error = true;
+ goto good;
}
type = ntohl(xdr->type);
@@ -123,13 +125,16 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
data_version |= (u64)ntohl(xdr->data_version_hi) << 32;
status->data_version = data_version;
scb->have_status = true;
-
+good:
+ ret = 0;
+advance:
*_bp = (const void *)*_bp + sizeof(*xdr);
- return 0;
+ return ret;
bad:
xdr_dump_bad(*_bp);
- return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ ret = afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ goto advance;
}
static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
@@ -380,8 +385,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
ASSERTCMP(req->offset, <=, PAGE_SIZE);
if (req->offset == PAGE_SIZE) {
req->offset = 0;
- if (req->page_done)
- req->page_done(req);
req->index++;
if (req->remain > 0)
goto begin_page;
@@ -435,11 +438,13 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
if (req->offset < PAGE_SIZE)
zero_user_segment(req->pages[req->index],
req->offset, PAGE_SIZE);
- if (req->page_done)
- req->page_done(req);
req->offset = 0;
}
+ if (req->page_done)
+ for (req->index = 0; req->index < req->nr_pages; req->index++)
+ req->page_done(req);
+
_leave(" = 0 [done]");
return 0;
}
@@ -981,16 +986,16 @@ static int afs_deliver_fs_rename(struct afs_call *call)
if (ret < 0)
return ret;
- /* unmarshall the reply once we've received all of it */
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
bp = call->buffer;
ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
if (ret < 0)
return ret;
- if (call->out_dir_scb != call->out_scb) {
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- }
+ ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+ if (ret < 0)
+ return ret;
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ef732dd4e7ef..80255513e230 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -533,12 +533,10 @@ struct afs_server {
u32 abort_code;
u32 cm_epoch;
short error;
- bool have_result;
bool responded:1;
bool is_yfs:1;
bool not_yfs:1;
bool local_failure:1;
- bool no_epoch:1;
bool cm_probed:1;
bool said_rebooted:1;
bool said_inconsistent:1;
@@ -1335,7 +1333,7 @@ extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
extern void afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
-extern int afs_check_volume_status(struct afs_volume *, struct key *);
+extern int afs_check_volume_status(struct afs_volume *, struct afs_fs_cursor *);
/*
* write.c
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 172ba569cd60..2a3305e42b14 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -192,7 +192,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
write_unlock(&vnode->volume->servers_lock);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
- error = afs_check_volume_status(vnode->volume, fc->key);
+ error = afs_check_volume_status(vnode->volume, fc);
if (error < 0)
goto failed_set_error;
@@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
- error = afs_check_volume_status(vnode->volume, fc->key);
+ error = afs_check_volume_status(vnode->volume, fc);
if (error < 0)
goto failed_set_error;
@@ -341,7 +341,7 @@ start:
/* See if we need to do an update of the volume record. Note that the
* volume may have moved or even have been deleted.
*/
- error = afs_check_volume_status(vnode->volume, fc->key);
+ error = afs_check_volume_status(vnode->volume, fc);
if (error < 0)
goto failed_set_error;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index b7f3cb2130ca..11b90ac7ea30 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -594,12 +594,9 @@ retry:
}
ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING,
- TASK_INTERRUPTIBLE);
+ (fc->flags & AFS_FS_CURSOR_INTR) ?
+ TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (ret == -ERESTARTSYS) {
- if (!(fc->flags & AFS_FS_CURSOR_INTR) && server->addresses) {
- _leave(" = t [intr]");
- return true;
- }
fc->error = ret;
_leave(" = f [intr]");
return false;
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index 858498cc1b05..e3aa013c2177 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -31,10 +31,9 @@ void afs_vlserver_probe_result(struct afs_call *call)
struct afs_addr_list *alist = call->alist;
struct afs_vlserver *server = call->vlserver;
unsigned int server_index = call->server_index;
+ unsigned int rtt_us = 0;
unsigned int index = call->addr_ix;
- unsigned int rtt = UINT_MAX;
bool have_result = false;
- u64 _rtt;
int ret = call->error;
_enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code);
@@ -93,15 +92,9 @@ responded:
}
}
- /* Get the RTT and scale it to fit into a 32-bit value that represents
- * over a minute of time so that we can access it with one instruction
- * on a 32-bit system.
- */
- _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
- _rtt /= 64;
- rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
- if (rtt < server->probe.rtt) {
- server->probe.rtt = rtt;
+ rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
+ if (rtt_us < server->probe.rtt) {
+ server->probe.rtt = rtt_us;
alist->preferred = index;
have_result = true;
}
@@ -113,8 +106,7 @@ out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
- server_index, index, &alist->addrs[index].transport,
- (unsigned int)rtt, ret);
+ server_index, index, &alist->addrs[index].transport, rtt_us, ret);
have_result |= afs_vl_probe_done(server);
if (have_result) {
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index 9a5ce9687779..72eacc14e6e1 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -302,8 +302,8 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
pr_notice("VC: - nr=%u/%u/%u pf=%u\n",
a->nr_ipv4, a->nr_addrs, a->max_addrs,
a->preferred);
- pr_notice("VC: - pr=%lx R=%lx F=%lx\n",
- a->probed, a->responded, a->failed);
+ pr_notice("VC: - R=%lx F=%lx\n",
+ a->responded, a->failed);
if (a == vc->ac.alist)
pr_notice("VC: - current\n");
}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 92ca5e27573b..4310336b9bb8 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -281,7 +281,7 @@ error:
/*
* Make sure the volume record is up to date.
*/
-int afs_check_volume_status(struct afs_volume *volume, struct key *key)
+int afs_check_volume_status(struct afs_volume *volume, struct afs_fs_cursor *fc)
{
time64_t now = ktime_get_real_seconds();
int ret, retries = 0;
@@ -299,7 +299,7 @@ retry:
}
if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) {
- ret = afs_update_volume_status(volume, key);
+ ret = afs_update_volume_status(volume, fc->key);
clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags);
clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags);
wake_up_bit(&volume->flags, AFS_VOLUME_WAIT);
@@ -312,7 +312,9 @@ retry:
return 0;
}
- ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, TASK_INTERRUPTIBLE);
+ ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT,
+ (fc->flags & AFS_FS_CURSOR_INTR) ?
+ TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (ret == -ERESTARTSYS) {
_leave(" = %d", ret);
return ret;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index a26126ac7bf1..fe413e7a5cf4 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -165,15 +165,15 @@ static void xdr_dump_bad(const __be32 *bp)
int i;
pr_notice("YFS XDR: Bad status record\n");
- for (i = 0; i < 5 * 4 * 4; i += 16) {
+ for (i = 0; i < 6 * 4 * 4; i += 16) {
memcpy(x, bp, 16);
bp += 4;
pr_notice("%03x: %08x %08x %08x %08x\n",
i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3]));
}
- memcpy(x, bp, 4);
- pr_notice("0x50: %08x\n", ntohl(x[0]));
+ memcpy(x, bp, 8);
+ pr_notice("0x60: %08x %08x\n", ntohl(x[0]), ntohl(x[1]));
}
/*
@@ -186,13 +186,14 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp;
struct afs_file_status *status = &scb->status;
u32 type;
+ int ret;
status->abort_code = ntohl(xdr->abort_code);
if (status->abort_code != 0) {
if (status->abort_code == VNOVNODE)
status->nlink = 0;
scb->have_error = true;
- return 0;
+ goto good;
}
type = ntohl(xdr->type);
@@ -220,13 +221,16 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
status->size = xdr_to_u64(xdr->size);
status->data_version = xdr_to_u64(xdr->data_version);
scb->have_status = true;
-
+good:
+ ret = 0;
+advance:
*_bp += xdr_size(xdr);
- return 0;
+ return ret;
bad:
xdr_dump_bad(*_bp);
- return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ ret = afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ goto advance;
}
/*
@@ -493,8 +497,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
ASSERTCMP(req->offset, <=, PAGE_SIZE);
if (req->offset == PAGE_SIZE) {
req->offset = 0;
- if (req->page_done)
- req->page_done(req);
req->index++;
if (req->remain > 0)
goto begin_page;
@@ -552,11 +554,13 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
if (req->offset < PAGE_SIZE)
zero_user_segment(req->pages[req->index],
req->offset, PAGE_SIZE);
- if (req->page_done)
- req->page_done(req);
req->offset = 0;
}
+ if (req->page_done)
+ for (req->index = 0; req->index < req->nr_pages; req->index++)
+ req->page_done(req);
+
_leave(" = 0 [done]");
return 0;
}
@@ -1153,11 +1157,9 @@ static int yfs_deliver_fs_rename(struct afs_call *call)
ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
if (ret < 0)
return ret;
- if (call->out_dir_scb != call->out_scb) {
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- }
+ ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+ if (ret < 0)
+ return ret;
xdr_decode_YFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 13f25e241ac4..92402c5606a1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -40,12 +40,18 @@
#include <linux/sched/coredump.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
#include <linux/cred.h>
#include <linux/dax.h>
#include <linux/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
+#ifndef ELF_COMPAT
+#define ELF_COMPAT 0
+#endif
+
#ifndef user_long_t
#define user_long_t long
#endif
@@ -539,7 +545,8 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
-static inline int make_prot(u32 p_flags)
+static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state,
+ bool has_interp, bool is_interp)
{
int prot = 0;
@@ -549,7 +556,8 @@ static inline int make_prot(u32 p_flags)
prot |= PROT_WRITE;
if (p_flags & PF_X)
prot |= PROT_EXEC;
- return prot;
+
+ return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp);
}
/* This is much more generalized than the library routine read function,
@@ -559,7 +567,8 @@ static inline int make_prot(u32 p_flags)
static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
struct file *interpreter,
- unsigned long no_base, struct elf_phdr *interp_elf_phdata)
+ unsigned long no_base, struct elf_phdr *interp_elf_phdata,
+ struct arch_elf_state *arch_state)
{
struct elf_phdr *eppnt;
unsigned long load_addr = 0;
@@ -591,7 +600,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
if (eppnt->p_type == PT_LOAD) {
int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
- int elf_prot = make_prot(eppnt->p_flags);
+ int elf_prot = make_prot(eppnt->p_flags, arch_state,
+ true, true);
unsigned long vaddr = 0;
unsigned long k, map_addr;
@@ -682,6 +692,111 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
+static int parse_elf_property(const char *data, size_t *off, size_t datasz,
+ struct arch_elf_state *arch,
+ bool have_prev_type, u32 *prev_type)
+{
+ size_t o, step;
+ const struct gnu_property *pr;
+ int ret;
+
+ if (*off == datasz)
+ return -ENOENT;
+
+ if (WARN_ON_ONCE(*off > datasz || *off % ELF_GNU_PROPERTY_ALIGN))
+ return -EIO;
+ o = *off;
+ datasz -= *off;
+
+ if (datasz < sizeof(*pr))
+ return -ENOEXEC;
+ pr = (const struct gnu_property *)(data + o);
+ o += sizeof(*pr);
+ datasz -= sizeof(*pr);
+
+ if (pr->pr_datasz > datasz)
+ return -ENOEXEC;
+
+ WARN_ON_ONCE(o % ELF_GNU_PROPERTY_ALIGN);
+ step = round_up(pr->pr_datasz, ELF_GNU_PROPERTY_ALIGN);
+ if (step > datasz)
+ return -ENOEXEC;
+
+ /* Properties are supposed to be unique and sorted on pr_type: */
+ if (have_prev_type && pr->pr_type <= *prev_type)
+ return -ENOEXEC;
+ *prev_type = pr->pr_type;
+
+ ret = arch_parse_elf_property(pr->pr_type, data + o,
+ pr->pr_datasz, ELF_COMPAT, arch);
+ if (ret)
+ return ret;
+
+ *off = o + step;
+ return 0;
+}
+
+#define NOTE_DATA_SZ SZ_1K
+#define GNU_PROPERTY_TYPE_0_NAME "GNU"
+#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME))
+
+static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
+ struct arch_elf_state *arch)
+{
+ union {
+ struct elf_note nhdr;
+ char data[NOTE_DATA_SZ];
+ } note;
+ loff_t pos;
+ ssize_t n;
+ size_t off, datasz;
+ int ret;
+ bool have_prev_type;
+ u32 prev_type;
+
+ if (!IS_ENABLED(CONFIG_ARCH_USE_GNU_PROPERTY) || !phdr)
+ return 0;
+
+ /* load_elf_binary() shouldn't call us unless this is true... */
+ if (WARN_ON_ONCE(phdr->p_type != PT_GNU_PROPERTY))
+ return -ENOEXEC;
+
+ /* If the properties are crazy large, that's too bad (for now): */
+ if (phdr->p_filesz > sizeof(note))
+ return -ENOEXEC;
+
+ pos = phdr->p_offset;
+ n = kernel_read(f, &note, phdr->p_filesz, &pos);
+
+ BUILD_BUG_ON(sizeof(note) < sizeof(note.nhdr) + NOTE_NAME_SZ);
+ if (n < 0 || n < sizeof(note.nhdr) + NOTE_NAME_SZ)
+ return -EIO;
+
+ if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 ||
+ note.nhdr.n_namesz != NOTE_NAME_SZ ||
+ strncmp(note.data + sizeof(note.nhdr),
+ GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr)))
+ return -ENOEXEC;
+
+ off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ,
+ ELF_GNU_PROPERTY_ALIGN);
+ if (off > n)
+ return -ENOEXEC;
+
+ if (note.nhdr.n_descsz > n - off)
+ return -ENOEXEC;
+ datasz = off + note.nhdr.n_descsz;
+
+ have_prev_type = false;
+ do {
+ ret = parse_elf_property(note.data, &off, datasz, arch,
+ have_prev_type, &prev_type);
+ have_prev_type = true;
+ } while (!ret);
+
+ return ret == -ENOENT ? 0 : ret;
+}
+
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
@@ -689,6 +804,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
int load_addr_set = 0;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
+ struct elf_phdr *elf_property_phdata = NULL;
unsigned long elf_bss, elf_brk;
int bss_prot = 0;
int retval, i;
@@ -726,6 +842,11 @@ static int load_elf_binary(struct linux_binprm *bprm)
for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) {
char *elf_interpreter;
+ if (elf_ppnt->p_type == PT_GNU_PROPERTY) {
+ elf_property_phdata = elf_ppnt;
+ continue;
+ }
+
if (elf_ppnt->p_type != PT_INTERP)
continue;
@@ -819,9 +940,14 @@ out_free_interp:
goto out_free_dentry;
/* Pass PT_LOPROC..PT_HIPROC headers to arch code */
+ elf_property_phdata = NULL;
elf_ppnt = interp_elf_phdata;
for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++)
switch (elf_ppnt->p_type) {
+ case PT_GNU_PROPERTY:
+ elf_property_phdata = elf_ppnt;
+ break;
+
case PT_LOPROC ... PT_HIPROC:
retval = arch_elf_pt_proc(interp_elf_ex,
elf_ppnt, interpreter,
@@ -832,6 +958,11 @@ out_free_interp:
}
}
+ retval = parse_elf_properties(interpreter ?: bprm->file,
+ elf_property_phdata, &arch_state);
+ if (retval)
+ goto out_free_dentry;
+
/*
* Allow arch code to reject the ELF at this point, whilst it's
* still possible to return an error to the code that invoked
@@ -913,7 +1044,8 @@ out_free_interp:
}
}
- elf_prot = make_prot(elf_ppnt->p_flags);
+ elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
+ !!interpreter, false);
elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
@@ -1056,7 +1188,8 @@ out_free_interp:
if (interpreter) {
elf_entry = load_elf_interp(interp_elf_ex,
interpreter,
- load_bias, interp_elf_phdata);
+ load_bias, interp_elf_phdata,
+ &arch_state);
if (!IS_ERR((void *)elf_entry)) {
/*
* load_elf_interp() returns relocation
@@ -1733,7 +1866,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
(!regset->active || regset->active(t->task, regset) > 0)) {
int ret;
size_t size = regset_size(t->task, regset);
- void *data = kmalloc(size, GFP_KERNEL);
+ void *data = kzalloc(size, GFP_KERNEL);
if (unlikely(!data))
return 0;
ret = regset->get(t->task, regset,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 52b6f646cdbd..93672c3f1c78 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -19,7 +19,6 @@
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
-#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
@@ -1893,6 +1892,16 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
struct gendisk *disk = bdev->bd_disk;
struct block_device *victim = NULL;
+ /*
+ * Sync early if it looks like we're the last one. If someone else
+ * opens the block device between now and the decrement of bd_openers
+ * then we did a sync that we didn't need to, but that's not the end
+ * of the world and we want to avoid long (could be several minute)
+ * syncs while holding the mutex.
+ */
+ if (bdev->bd_openers == 1)
+ sync_blockdev(bdev);
+
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (for_part)
bdev->bd_part_count--;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9c380e7edf62..0cc02577577b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -391,7 +391,7 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct prelim_ref *ref = NULL;
- struct prelim_ref target = {0};
+ struct prelim_ref target = {};
int result;
target.parent = bytenr;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 786849fcc319..696f47103cfc 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -916,7 +916,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out;
+ goto out_put_group;
}
/*
@@ -954,7 +954,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(inode);
- goto out;
+ goto out_put_group;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -977,13 +977,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ goto out_put_group;
if (ret > 0)
btrfs_release_path(path);
if (ret == 0) {
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out;
+ goto out_put_group;
btrfs_release_path(path);
}
@@ -1102,9 +1102,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = remove_block_group_free_space(trans, block_group);
if (ret)
- goto out;
+ goto out_put_group;
- btrfs_put_block_group(block_group);
+ /* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -1127,6 +1127,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
/* once for the tree */
free_extent_map(em);
}
+
+out_put_group:
+ /* Once for the lookup reference */
+ btrfs_put_block_group(block_group);
out:
if (remove_rsv)
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -1288,11 +1292,15 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
if (ret)
goto err;
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ if (prev_trans)
+ btrfs_put_transaction(prev_trans);
return true;
err:
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ if (prev_trans)
+ btrfs_put_transaction(prev_trans);
btrfs_dec_block_group_ro(bg);
return false;
}
@@ -3370,6 +3378,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
space_info->bytes_reserved > 0 ||
space_info->bytes_may_use > 0))
btrfs_dump_space_info(info, space_info, 0, 0);
+ WARN_ON(space_info->reclaim_size > 0);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
}
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index 21a15776dac4..353228d62f5a 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_DISCARD_H
#define BTRFS_DISCARD_H
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a6cb5cbbdb9f..d10c7be10f3b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2036,9 +2036,6 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++)
btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
-
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- btrfs_free_log_root_tree(NULL, fs_info);
}
static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
@@ -3888,7 +3885,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->fs_roots_radix_lock);
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- btrfs_free_log(NULL, root);
+ ASSERT(root->log_root == NULL);
if (root->reloc_root) {
btrfs_put_root(root->reloc_root);
root->reloc_root = NULL;
@@ -4211,6 +4208,36 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
up_write(&fs_info->cleanup_work_sem);
}
+static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *gang[8];
+ u64 root_objectid = 0;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang))) != 0) {
+ int i;
+
+ for (i = 0; i < ret; i++)
+ gang[i] = btrfs_grab_root(gang[i]);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
+ root_objectid = gang[i]->root_key.objectid;
+ btrfs_free_log(NULL, gang[i]);
+ btrfs_put_root(gang[i]);
+ }
+ root_objectid++;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ btrfs_free_log_root_tree(NULL, fs_info);
+}
+
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
@@ -4603,6 +4630,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
btrfs_destroy_delayed_inodes(fs_info);
btrfs_assert_delayed_root_empty(fs_info);
btrfs_destroy_all_delalloc_inodes(fs_info);
+ btrfs_drop_all_logs(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8a144f9cb7ac..719e68ab552c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2098,6 +2098,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
+ * If the inode needs a full sync, make sure we use a full range to
+ * avoid log tree corruption, due to hole detection racing with ordered
+ * extent completion for adjacent ranges and races between logging and
+ * completion of ordered extents for adjancent ranges - both races
+ * could lead to file extent items in the log with overlapping ranges.
+ * Do this while holding the inode lock, to avoid races with other
+ * tasks.
+ */
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ /*
* Before we acquired the inode's lock, someone may have dirtied more
* pages in the target range. We need to make sure that writeback for
* any such pages does not start while we are logging the inode, because
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index d1973141d3bb..040009d1cc31 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -264,6 +264,7 @@ copy_inline_extent:
size);
inode_add_bytes(dst, datal);
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
if (!ret && !trans) {
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f65595602aa8..03bc7134e8cb 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -611,8 +611,8 @@ static int should_ignore_root(struct btrfs_root *root)
if (!reloc_root)
return 0;
- if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
- root->fs_info->running_transaction->transid - 1)
+ if (btrfs_header_generation(reloc_root->commit_root) ==
+ root->fs_info->running_transaction->transid)
return 0;
/*
* if there is reloc tree and it was created in previous
@@ -1527,8 +1527,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int clear_rsv = 0;
int ret;
- if (!rc || !rc->create_reloc_tree ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (!rc)
return 0;
/*
@@ -1538,12 +1537,28 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
if (reloc_root_is_dead(root))
return 0;
+ /*
+ * This is subtle but important. We do not do
+ * record_root_in_transaction for reloc roots, instead we record their
+ * corresponding fs root, and then here we update the last trans for the
+ * reloc root. This means that we have to do this for the entire life
+ * of the reloc root, regardless of which stage of the relocation we are
+ * in.
+ */
if (root->reloc_root) {
reloc_root = root->reloc_root;
reloc_root->last_trans = trans->transid;
return 0;
}
+ /*
+ * We are merging reloc roots, we do not need new reloc trees. Also
+ * reloc trees never need their own reloc tree.
+ */
+ if (!rc->create_reloc_tree ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
if (!trans->reloc_reserved) {
rsv = trans->block_rsv;
trans->block_rsv = rc->block_rsv;
@@ -4544,6 +4559,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
list_add_tail(&reloc_root->root_list, &reloc_roots);
+ btrfs_end_transaction(trans);
goto out_unset;
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 8b0fe053a25d..ff17a4420358 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -361,6 +361,16 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
return 0;
}
+static void remove_ticket(struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ if (!list_empty(&ticket->list)) {
+ list_del_init(&ticket->list);
+ ASSERT(space_info->reclaim_size >= ticket->bytes);
+ space_info->reclaim_size -= ticket->bytes;
+ }
+}
+
/*
* This is for space we already have accounted in space_info->bytes_may_use, so
* basically when we're returning space from block_rsv's.
@@ -388,9 +398,7 @@ again:
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
- list_del_init(&ticket->list);
- ASSERT(space_info->reclaim_size >= ticket->bytes);
- space_info->reclaim_size -= ticket->bytes;
+ remove_ticket(space_info, ticket);
ticket->bytes = 0;
space_info->tickets_id++;
wake_up(&ticket->wait);
@@ -899,7 +907,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
- list_del_init(&ticket->list);
+ remove_ticket(space_info, ticket);
ticket->error = -ENOSPC;
wake_up(&ticket->wait);
@@ -1063,7 +1071,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
* despite getting an error, resulting in a space leak
* (bytes_may_use counter of our space_info).
*/
- list_del_init(&ticket->list);
+ remove_ticket(space_info, ticket);
ticket->error = -EINTR;
break;
}
@@ -1121,7 +1129,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
* either the async reclaim job deletes the ticket from the list
* or we delete it ourselves at wait_reserve_ticket().
*/
- list_del_init(&ticket->list);
+ remove_ticket(space_info, ticket);
if (!ret)
ret = -ENOSPC;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8cede6eb9843..2d5498136e5e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -662,10 +662,19 @@ again:
}
got_it:
- btrfs_record_root_in_trans(h, root);
-
if (!current->journal_info)
current->journal_info = h;
+
+ /*
+ * btrfs_record_root_in_trans() needs to alloc new extents, and may
+ * call btrfs_join_transaction() while we're also starting a
+ * transaction.
+ *
+ * Thus it need to be called after current->journal_info initialized,
+ * or we can deadlock.
+ */
+ btrfs_record_root_in_trans(h, root);
+
return h;
join_fail:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 58c111474ba5..02ebdd9edc19 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,8 @@ enum {
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- u64 start,
- u64 end,
+ const loff_t start,
+ const loff_t end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -4226,6 +4226,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
const u64 ino = btrfs_ino(inode);
struct btrfs_path *dst_path = NULL;
bool dropped_extents = false;
+ u64 truncate_offset = i_size;
+ struct extent_buffer *leaf;
+ int slot;
int ins_nr = 0;
int start_slot;
int ret;
@@ -4240,9 +4243,43 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
+ /*
+ * We must check if there is a prealloc extent that starts before the
+ * i_size and crosses the i_size boundary. This is to ensure later we
+ * truncate down to the end of that extent and not to the i_size, as
+ * otherwise we end up losing part of the prealloc extent after a log
+ * replay and with an implicit hole if there is another prealloc extent
+ * that starts at an offset beyond i_size.
+ */
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ struct btrfs_file_extent_item *ei;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 extent_end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, ei);
+
+ if (extent_end > i_size)
+ truncate_offset = extent_end;
+ }
+ } else {
+ ret = 0;
+ }
+
while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
+ leaf = path->nodes[0];
+ slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
@@ -4280,7 +4317,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
ret = btrfs_truncate_inode_items(trans,
root->log_root,
&inode->vfs_inode,
- i_size,
+ truncate_offset,
BTRFS_EXTENT_DATA_KEY);
} while (ret == -EAGAIN);
if (ret)
@@ -4533,15 +4570,13 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
- struct btrfs_path *path,
- const u64 start,
- const u64 end)
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(&inode->vfs_inode);
- u64 prev_extent_end = start;
+ u64 prev_extent_end = 0;
int ret;
if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
@@ -4549,21 +4584,14 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
+ key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret > 0 && path->slots[0] > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path->slots[0]--;
- }
-
while (true) {
struct extent_buffer *leaf = path->nodes[0];
- u64 extent_end;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
@@ -4580,18 +4608,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
- extent_end = btrfs_file_extent_end(path);
- if (extent_end <= start)
- goto next_slot;
-
/* We have a hole, log it. */
if (prev_extent_end < key.offset) {
- u64 hole_len;
-
- if (key.offset >= end)
- hole_len = end - prev_extent_end;
- else
- hole_len = key.offset - prev_extent_end;
+ const u64 hole_len = key.offset - prev_extent_end;
/*
* Release the path to avoid deadlocks with other code
@@ -4621,20 +4640,16 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
}
- prev_extent_end = min(extent_end, end);
- if (extent_end >= end)
- break;
-next_slot:
+ prev_extent_end = btrfs_file_extent_end(path);
path->slots[0]++;
cond_resched();
}
- if (prev_extent_end < end && prev_extent_end < i_size) {
+ if (prev_extent_end < i_size) {
u64 hole_len;
btrfs_release_path(path);
- hole_len = min(ALIGN(i_size, fs_info->sectorsize), end);
- hole_len -= prev_extent_end;
+ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
ret = btrfs_insert_file_extent(trans, root->log_root,
ino, prev_extent_end, 0, 0,
hole_len, 0, hole_len,
@@ -4971,8 +4986,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
const u64 logged_isize,
const bool recursive_logging,
const int inode_only,
- const u64 start,
- const u64 end,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
@@ -4981,21 +4994,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
int ins_nr = 0;
int ret;
- /*
- * We must make sure we don't copy extent items that are entirely out of
- * the range [start, end - 1]. This is not just an optimization to avoid
- * copying but also needed to avoid a corruption where we end up with
- * file extent items in the log tree that have overlapping ranges - this
- * can happen if we race with ordered extent completion for ranges that
- * are outside our target range. For example we copy an extent item and
- * when we move to the next leaf, that extent was trimmed and a new one
- * covering a subrange of it, but with a higher key, was inserted - we
- * would then copy this other extent too, resulting in a log tree with
- * 2 extent items that represent overlapping ranges.
- *
- * We can copy the entire extents at the range bondaries however, even
- * if they cover an area outside the target range. That's ok.
- */
while (1) {
ret = btrfs_search_forward(root, min_key, path, trans->transid);
if (ret < 0)
@@ -5063,29 +5061,6 @@ again:
goto next_slot;
}
- if (min_key->type == BTRFS_EXTENT_DATA_KEY) {
- const u64 extent_end = btrfs_file_extent_end(path);
-
- if (extent_end <= start) {
- if (ins_nr > 0) {
- ret = copy_items(trans, inode, dst_path,
- path, ins_start_slot,
- ins_nr, inode_only,
- logged_isize);
- if (ret < 0)
- return ret;
- ins_nr = 0;
- }
- goto next_slot;
- }
- if (extent_end >= end) {
- ins_nr++;
- if (ins_nr == 1)
- ins_start_slot = path->slots[0];
- break;
- }
- }
-
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
goto next_slot;
@@ -5151,8 +5126,8 @@ next_key:
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- u64 start,
- u64 end,
+ const loff_t start,
+ const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5180,9 +5155,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- start = ALIGN_DOWN(start, fs_info->sectorsize);
- end = ALIGN(end, fs_info->sectorsize);
-
min_key.objectid = ino;
min_key.type = BTRFS_INODE_ITEM_KEY;
min_key.offset = 0;
@@ -5298,8 +5270,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
- recursive_logging, inode_only,
- start, end, ctx, &need_log_inode_item);
+ recursive_logging, inode_only, ctx,
+ &need_log_inode_item);
if (err)
goto out_unlock;
@@ -5312,7 +5284,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path, start, end);
+ err = btrfs_log_holes(trans, root, inode, path);
if (err)
goto out_unlock;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index f73276d746bb..a60f60396cfa 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -967,7 +967,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
struct page *page;
struct buffer_head *bh;
sector_t end_block;
- int ret = 0; /* Will call free_more_memory() */
+ int ret = 0;
gfp_t gfp_mask;
gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
@@ -1371,6 +1371,17 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
}
EXPORT_SYMBOL(__breadahead);
+void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
+ gfp_t gfp)
+{
+ struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+ if (likely(bh)) {
+ ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
+ brelse(bh);
+ }
+}
+EXPORT_SYMBOL(__breadahead_gfp);
+
/**
* __bread_gfp() - reads a specified block and returns the bh
* @bdev: the block_device to read from
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1dc97f2d6201..e7726f5f1241 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -60,9 +60,9 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
object = container_of(op->op.object, struct cachefiles_object, fscache);
spin_lock(&object->work_lock);
list_add_tail(&monitor->op_link, &op->to_do);
+ fscache_enqueue_retrieval(op);
spin_unlock(&object->work_lock);
- fscache_enqueue_retrieval(op);
fscache_put_retrieval(op);
return 0;
}
@@ -398,7 +398,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
struct inode *inode;
sector_t block;
unsigned shift;
- int ret;
+ int ret, ret2;
object = container_of(op->op.object,
struct cachefiles_object, fscache);
@@ -430,8 +430,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
block = page->index;
block <<= shift;
- ret = bmap(inode, &block);
- ASSERT(ret < 0);
+ ret2 = bmap(inode, &block);
+ ASSERT(ret2 == 0);
_debug("%llx -> %llx",
(unsigned long long) (page->index << shift),
@@ -739,8 +739,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
block = page->index;
block <<= shift;
- ret = bmap(inode, &block);
- ASSERT(!ret);
+ ret2 = bmap(inode, &block);
+ ASSERT(ret2 == 0);
_debug("%llx -> %llx",
(unsigned long long) (page->index << shift),
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 185db76300b3..f1acde6fb9a6 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2749,7 +2749,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
ret = try_get_cap_refs(inode, need, want, 0, flags, got);
/* three special error codes */
- if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN)
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
ret = 0;
return ret;
}
@@ -3746,6 +3746,7 @@ retry:
WARN_ON(1);
tsession = NULL;
target = -1;
+ mutex_lock(&session->s_mutex);
}
goto retry;
@@ -3990,7 +3991,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
__ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- goto done;
+ goto flush_cap_releases;
}
/* these will work even if we don't have a cap yet */
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 481ac97b4d25..dcaed75de9e6 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -271,7 +271,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
&congestion_kb_fops);
snprintf(name, sizeof(name), "../../bdi/%s",
- dev_name(fsc->sb->s_bdi->dev));
+ bdi_dev_name(fsc->sb->s_bdi));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
fsc->client->debugfs_dir,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d594c2627430..4c4202c93b71 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1051,8 +1051,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* If op failed, mark everyone involved for errors */
if (result) {
- int pathlen;
- u64 base;
+ int pathlen = 0;
+ u64 base = 0;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4a5ccbb7e808..afdfca965a7f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -527,8 +527,8 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
if (result) {
struct dentry *dentry = req->r_dentry;
- int pathlen;
- u64 base;
+ int pathlen = 0;
+ u64 base = 0;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 486f91f9685b..7c63abf5bea9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3251,8 +3251,7 @@ static void handle_session(struct ceph_mds_session *session,
void *end = p + msg->front.iov_len;
struct ceph_mds_session_head *h;
u32 op;
- u64 seq;
- unsigned long features = 0;
+ u64 seq, features = 0;
int wake = 0;
bool blacklisted = false;
@@ -3271,9 +3270,8 @@ static void handle_session(struct ceph_mds_session *session,
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
- ceph_decode_need(&p, end, len, bad);
- memcpy(&features, p, min_t(size_t, len, sizeof(features)));
- p += len;
+ ceph_decode_64_safe(&p, end, features, bad);
+ p += len - sizeof(features);
}
mutex_lock(&mdsc->mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e5be79bf080..903d9edfd4bf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -521,7 +521,7 @@ extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
static inline void ceph_mdsc_free_path(char *path, int len)
{
- if (path)
+ if (!IS_ERR_OR_NULL(path))
__putname(path - (PATH_MAX - 1 - len));
}
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index de56dee60540..19507e2fdb57 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -159,8 +159,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
}
if (IS_ERR(in)) {
- pr_warn("Can't lookup inode %llx (err: %ld)\n",
- realm->ino, PTR_ERR(in));
+ dout("Can't lookup inode %llx (err: %ld)\n",
+ realm->ino, PTR_ERR(in));
qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
} else {
qri->timeout = 0;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 05dd3dea684b..39b708d9d86d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1891,7 +1891,8 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
/*
* This lock protects the cifs_tcp_ses_list, the list of smb sessions per
* tcp session, and the list of tcon's per smb session. It also protects
- * the reference counters for the server, smb session, and tcon. Finally,
+ * the reference counters for the server, smb session, and tcon. It also
+ * protects some fields in the TCP_Server_Info struct such as dstaddr. Finally,
* changes to the tcon->tidStatus should be done while holding this lock.
* generally the locks should be taken in order tcp_ses_lock before
* tcon->open_file_lock and that before file->file_info_lock since the
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 140efc1a9374..5014a82391ff 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -594,6 +594,8 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
cifs_max_pending);
set_credits(server, server->maxReq);
server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+ /* set up max_read for readpages check */
+ server->max_read = server->maxBuf;
/* even though we do not use raw we might as well set this
accurately, in case we ever find a need for it */
if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -755,6 +757,8 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
set_credits(server, server->maxReq);
/* probably no need to store and check maxvcs */
server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
+ /* set up max_read for readpages check */
+ server->max_read = server->maxBuf;
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
@@ -2148,8 +2152,8 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
}
}
+ kref_put(&wdata2->refcount, cifs_writedata_release);
if (rc) {
- kref_put(&wdata2->refcount, cifs_writedata_release);
if (is_retryable_error(rc))
continue;
i += nr_pages;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 95b3ab0ca8c0..28268ed461b8 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -375,8 +375,10 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server)
return rc;
}
+ spin_lock(&cifs_tcp_ses_lock);
rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
strlen(ipaddr));
+ spin_unlock(&cifs_tcp_ses_lock);
kfree(ipaddr);
return !rc ? -1 : 0;
@@ -3373,6 +3375,10 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &ses->tcon_list) {
tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (tcon->dfs_path)
+ continue;
+#endif
if (!match_tcon(tcon, volume_info))
continue;
++tcon->tc_count;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0b1528edebcf..75ddce8ef456 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4060,7 +4060,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
* than it negotiated since it will refuse the read
* then.
*/
- if ((tcon->ses) && !(tcon->ses->capabilities &
+ if (!(tcon->ses->capabilities &
tcon->ses->server->vals->cap_large_files)) {
current_read_size = min_t(uint,
current_read_size, CIFSMaxBufSize);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8fbbdcdad8ff..5d2965a23730 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -61,7 +61,7 @@ static void cifs_set_ops(struct inode *inode)
}
/* check if server can support readpages */
- if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
+ if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read <
PAGE_SIZE + MAX_CIFS_HDR_SIZE)
inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
else
@@ -730,7 +730,7 @@ static __u64 simple_hashstr(const char *str)
* cifs_backup_query_path_info - SMB1 fallback code to get ino
*
* Fallback code to get file metadata when we don't have access to
- * @full_path (EACCESS) and have backup creds.
+ * @full_path (EACCES) and have backup creds.
*
* @data will be set to search info result buffer
* @resp_buf will be set to cifs resp buf and needs to be freed with
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a456febd4109..550ce9020a3e 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1025,51 +1025,99 @@ int copy_path_name(char *dst, const char *src)
}
struct super_cb_data {
- struct TCP_Server_Info *server;
+ void *data;
struct super_block *sb;
};
-static void super_cb(struct super_block *sb, void *arg)
+static void tcp_super_cb(struct super_block *sb, void *arg)
{
- struct super_cb_data *d = arg;
+ struct super_cb_data *sd = arg;
+ struct TCP_Server_Info *server = sd->data;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- if (d->sb)
+ if (sd->sb)
return;
cifs_sb = CIFS_SB(sb);
tcon = cifs_sb_master_tcon(cifs_sb);
- if (tcon->ses->server == d->server)
- d->sb = sb;
+ if (tcon->ses->server == server)
+ sd->sb = sb;
}
-struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server)
+static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void *),
+ void *data)
{
- struct super_cb_data d = {
- .server = server,
+ struct super_cb_data sd = {
+ .data = data,
.sb = NULL,
};
- iterate_supers_type(&cifs_fs_type, super_cb, &d);
+ iterate_supers_type(&cifs_fs_type, f, &sd);
- if (unlikely(!d.sb))
- return ERR_PTR(-ENOENT);
+ if (!sd.sb)
+ return ERR_PTR(-EINVAL);
/*
* Grab an active reference in order to prevent automounts (DFS links)
* of expiring and then freeing up our cifs superblock pointer while
* we're doing failover.
*/
- cifs_sb_active(d.sb);
- return d.sb;
+ cifs_sb_active(sd.sb);
+ return sd.sb;
}
-void cifs_put_tcp_super(struct super_block *sb)
+static void __cifs_put_super(struct super_block *sb)
{
if (!IS_ERR_OR_NULL(sb))
cifs_sb_deactive(sb);
}
+struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server)
+{
+ return __cifs_get_super(tcp_super_cb, server);
+}
+
+void cifs_put_tcp_super(struct super_block *sb)
+{
+ __cifs_put_super(sb);
+}
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static void tcon_super_cb(struct super_block *sb, void *arg)
+{
+ struct super_cb_data *sd = arg;
+ struct cifs_tcon *tcon = sd->data;
+ struct cifs_sb_info *cifs_sb;
+
+ if (sd->sb)
+ return;
+
+ cifs_sb = CIFS_SB(sb);
+ if (tcon->dfs_path && cifs_sb->origin_fullpath &&
+ !strcasecmp(tcon->dfs_path, cifs_sb->origin_fullpath))
+ sd->sb = sb;
+}
+
+static inline struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon)
+{
+ return __cifs_get_super(tcon_super_cb, tcon);
+}
+
+static inline void cifs_put_tcon_super(struct super_block *sb)
+{
+ __cifs_put_super(sb);
+}
+#else
+static inline struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void cifs_put_tcon_super(struct super_block *sb)
+{
+}
+#endif
+
int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
size_t prefix_len)
{
@@ -1077,7 +1125,7 @@ int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
struct cifs_sb_info *cifs_sb;
int rc = 0;
- sb = cifs_get_tcp_super(tcon->ses->server);
+ sb = cifs_get_tcon_super(tcon);
if (IS_ERR(sb))
return PTR_ERR(sb);
@@ -1099,6 +1147,6 @@ int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
out:
- cifs_put_tcp_super(sb);
+ cifs_put_tcon_super(sb);
return rc;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index b36c46f48705..f829f4165d38 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -687,6 +687,11 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
+ if (!server->ops->new_lease_key)
+ return -EIO;
+
+ server->ops->new_lease_key(pfid);
+
memset(rqst, 0, sizeof(rqst));
resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 47d3e382ecaa..b30aa3cdd845 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1552,6 +1552,21 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
}
rc = SMB2_sess_establish_session(sess_data);
+#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
+ if (ses->server->dialect < SMB30_PROT_ID) {
+ cifs_dbg(VFS, "%s: dumping generated SMB2 session keys\n", __func__);
+ /*
+ * The session id is opaque in terms of endianness, so we can't
+ * print it as a long long. we dump it as we got it on the wire
+ */
+ cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid),
+ &ses->Suid);
+ cifs_dbg(VFS, "Session Key %*ph\n",
+ SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response);
+ cifs_dbg(VFS, "Signing Key %*ph\n",
+ SMB3_SIGN_KEY_SIZE, ses->auth_key.response);
+ }
+#endif
out:
kfree(ntlmssp_blob);
SMB2_sess_free_buffer(sess_data);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 1a6c227ada8f..c0348e3b1695 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -660,8 +660,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return rc;
if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE)) {
- dump_stack();
- cifs_dbg(VFS, "sign fail cmd 0x%x message id 0x%llx\n", shdr->Command, shdr->MessageId);
+ cifs_dbg(VFS, "sign fail cmd 0x%x message id 0x%llx\n",
+ shdr->Command, shdr->MessageId);
return -EACCES;
} else
return 0;
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index aaad4ca1217e..13a087bc816b 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -17,6 +17,8 @@
#include <linux/elfcore-compat.h>
#include <linux/time.h>
+#define ELF_COMPAT 1
+
/*
* Rename the basic ELF layout types to refer to the 32-bit class of files.
*/
@@ -28,11 +30,13 @@
#undef elf_shdr
#undef elf_note
#undef elf_addr_t
+#undef ELF_GNU_PROPERTY_ALIGN
#define elfhdr elf32_hdr
#define elf_phdr elf32_phdr
#define elf_shdr elf32_shdr
#define elf_note elf32_note
#define elf_addr_t Elf32_Addr
+#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN
/*
* Some data types as stored in coredump.
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index cf7b7e1d5bd7..cb733652ecca 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1519,6 +1519,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
spin_lock(&configfs_dirent_lock);
configfs_detach_rollback(dentry);
spin_unlock(&configfs_dirent_lock);
+ config_item_put(parent_item);
return -EINTR;
}
frag->frag_dead = true;
diff --git a/fs/coredump.c b/fs/coredump.c
index f8296a82d01d..478a0d810136 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -211,6 +211,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
return -ENOMEM;
(*argv)[(*argc)++] = 0;
++pat_ptr;
+ if (!(*pat_ptr))
+ return -ENOMEM;
}
/* Repeat as long as we have more pattern to process and more output
@@ -786,6 +788,14 @@ void do_coredump(const kernel_siginfo_t *siginfo)
if (displaced)
put_files_struct(displaced);
if (!dump_interrupted()) {
+ /*
+ * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
+ * have this set to NULL.
+ */
+ if (!cprm.file) {
+ pr_info("Core dump to |%s disabled\n", cn.corename);
+ goto close_fail;
+ }
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
file_end_write(cprm.file);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 1ecaac7ee3cb..ed015cb66c7c 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -54,6 +54,7 @@ struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
/**
* fscrypt_free_bounce_page() - free a ciphertext bounce page
+ * @bounce_page: the bounce page to free, or NULL
*
* Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(),
* or by fscrypt_alloc_bounce_page() directly.
@@ -76,8 +77,12 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
memset(iv, 0, ci->ci_mode->ivsize);
if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
- WARN_ON_ONCE((u32)lblk_num != lblk_num);
+ WARN_ON_ONCE(lblk_num > U32_MAX);
+ WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX);
lblk_num |= (u64)ci->ci_inode->i_ino << 32;
+ } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
+ WARN_ON_ONCE(lblk_num > U32_MAX);
+ lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
}
@@ -132,7 +137,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
}
/**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a
+ * pagecache page
* @page: The locked pagecache page containing the block(s) to encrypt
* @len: Total size of the block(s) to encrypt. Must be a nonzero
* multiple of the filesystem's block size.
@@ -222,7 +228,8 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
/**
- * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a
+ * pagecache page
* @page: The locked pagecache page containing the block(s) to decrypt
* @len: Total size of the block(s) to decrypt. Must be a nonzero
* multiple of the filesystem's block size.
@@ -346,6 +353,8 @@ void fscrypt_msg(const struct inode *inode, const char *level,
/**
* fscrypt_init() - Set up for fs encryption.
+ *
+ * Return: 0 on success; -errno on failure
*/
static int __init fscrypt_init(void)
{
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 4c212442a8f7..83ca5f1e7934 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -18,7 +18,7 @@
#include <crypto/skcipher.h>
#include "fscrypt_private.h"
-/**
+/*
* struct fscrypt_nokey_name - identifier for directory entry when key is absent
*
* When userspace lists an encrypted directory without access to the key, the
@@ -83,13 +83,8 @@ static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
tfm = prev_tfm;
}
}
- {
- SHASH_DESC_ON_STACK(desc, tfm);
- desc->tfm = tfm;
-
- return crypto_shash_digest(desc, data, data_len, result);
- }
+ return crypto_shash_tfm_digest(tfm, data, data_len, result);
}
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
@@ -105,9 +100,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
/**
* fscrypt_fname_encrypt() - encrypt a filename
- *
- * The output buffer must be at least as large as the input buffer.
- * Any extra space is filled with NUL padding before encryption.
+ * @inode: inode of the parent directory (for regular filenames)
+ * or of the symlink (for symlink targets)
+ * @iname: the filename to encrypt
+ * @out: (output) the encrypted filename
+ * @olen: size of the encrypted filename. It must be at least @iname->len.
+ * Any extra space is filled with NUL padding before encryption.
*
* Return: 0 on success, -errno on failure
*/
@@ -157,8 +155,11 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
/**
* fname_decrypt() - decrypt a filename
- *
- * The caller must have allocated sufficient memory for the @oname string.
+ * @inode: inode of the parent directory (for regular filenames)
+ * or of the symlink (for symlink targets)
+ * @iname: the encrypted filename to decrypt
+ * @oname: (output) the decrypted filename. The caller must have allocated
+ * enough space for this, e.g. using fscrypt_fname_alloc_buffer().
*
* Return: 0 on success, -errno on failure
*/
@@ -206,7 +207,10 @@ static const char lookup_table[65] =
#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
/**
- * base64_encode() -
+ * base64_encode() - base64-encode some bytes
+ * @src: the bytes to encode
+ * @len: number of bytes to encode
+ * @dst: (output) the base64-encoded string. Not NUL-terminated.
*
* Encodes the input string using characters from the set [A-Za-z0-9+,].
* The encoded string is roughly 4/3 times the size of the input string.
@@ -272,7 +276,12 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
}
/**
- * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames
+ * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
+ * @inode: inode of the parent directory (for regular filenames)
+ * or of the symlink (for symlink targets)
+ * @max_encrypted_len: maximum length of encrypted filenames the buffer will be
+ * used to present
+ * @crypto_str: (output) buffer to allocate
*
* Allocate a buffer that is large enough to hold any decrypted or encoded
* filename (null-terminated), for the given maximum encrypted filename length.
@@ -297,9 +306,10 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode,
EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
/**
- * fscrypt_fname_free_buffer - free the buffer for presented filenames
+ * fscrypt_fname_free_buffer() - free a buffer for presented filenames
+ * @crypto_str: the buffer to free
*
- * Free the buffer allocated by fscrypt_fname_alloc_buffer().
+ * Free a buffer that was allocated by fscrypt_fname_alloc_buffer().
*/
void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
@@ -311,10 +321,19 @@ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
EXPORT_SYMBOL(fscrypt_fname_free_buffer);
/**
- * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
- * space
- *
- * The caller must have allocated sufficient memory for the @oname string.
+ * fscrypt_fname_disk_to_usr() - convert an encrypted filename to
+ * user-presentable form
+ * @inode: inode of the parent directory (for regular filenames)
+ * or of the symlink (for symlink targets)
+ * @hash: first part of the name's dirhash, if applicable. This only needs to
+ * be provided if the filename is located in an indexed directory whose
+ * encryption key may be unavailable. Not needed for symlink targets.
+ * @minor_hash: second part of the name's dirhash, if applicable
+ * @iname: encrypted filename to convert. May also be "." or "..", which
+ * aren't actually encrypted.
+ * @oname: output buffer for the user-presentable filename. The caller must
+ * have allocated enough space for this, e.g. using
+ * fscrypt_fname_alloc_buffer().
*
* If the key is available, we'll decrypt the disk name. Otherwise, we'll
* encode it for presentation in fscrypt_nokey_name format.
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index dbced2937ec8..eb7fcd2b7fb8 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -43,7 +43,7 @@ struct fscrypt_context_v2 {
u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
};
-/**
+/*
* fscrypt_context - the encryption context of an inode
*
* This is the on-disk equivalent of an fscrypt_policy, stored alongside each
@@ -157,7 +157,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy)
BUG();
}
-/**
+/*
* For encrypted symlinks, the ciphertext length is stored at the beginning
* of the string in little-endian format.
*/
@@ -222,6 +222,9 @@ struct fscrypt_info {
/* This inode's nonce, copied from the fscrypt_context */
u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+
+ /* Hashed inode number. Only set for IV_INO_LBLK_32 */
+ u32 ci_hashed_ino;
};
typedef enum {
@@ -231,15 +234,14 @@ typedef enum {
/* crypto.c */
extern struct kmem_cache *fscrypt_info_cachep;
-extern int fscrypt_initialize(unsigned int cop_flags);
-extern int fscrypt_crypt_block(const struct inode *inode,
- fscrypt_direction_t rw, u64 lblk_num,
- struct page *src_page, struct page *dest_page,
- unsigned int len, unsigned int offs,
- gfp_t gfp_flags);
-extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
-
-extern void __printf(3, 4) __cold
+int fscrypt_initialize(unsigned int cop_flags);
+int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
+ u64 lblk_num, struct page *src_page,
+ struct page *dest_page, unsigned int len,
+ unsigned int offs, gfp_t gfp_flags);
+struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
+
+void __printf(3, 4) __cold
fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
#define fscrypt_warn(inode, fmt, ...) \
@@ -264,12 +266,10 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci);
/* fname.c */
-extern int fscrypt_fname_encrypt(const struct inode *inode,
- const struct qstr *iname,
- u8 *out, unsigned int olen);
-extern bool fscrypt_fname_encrypted_size(const struct inode *inode,
- u32 orig_len, u32 max_len,
- u32 *encrypted_len_ret);
+int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
+ u8 *out, unsigned int olen);
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+ u32 max_len, u32 *encrypted_len_ret);
extern const struct dentry_operations fscrypt_d_ops;
/* hkdf.c */
@@ -278,8 +278,8 @@ struct fscrypt_hkdf {
struct crypto_shash *hmac_tfm;
};
-extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
- unsigned int master_key_size);
+int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
+ unsigned int master_key_size);
/*
* The list of contexts in which fscrypt uses HKDF. These values are used as
@@ -293,12 +293,14 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
#define HKDF_CONTEXT_DIRECT_KEY 3
#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4
#define HKDF_CONTEXT_DIRHASH_KEY 5
+#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6
+#define HKDF_CONTEXT_INODE_HASH_KEY 7
-extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
- const u8 *info, unsigned int infolen,
- u8 *okm, unsigned int okmlen);
+int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen);
-extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
+void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
/* keyring.c */
@@ -389,14 +391,17 @@ struct fscrypt_master_key {
struct list_head mk_decrypted_inodes;
spinlock_t mk_decrypted_inodes_lock;
- /* Crypto API transforms for DIRECT_KEY policies, allocated on-demand */
- struct crypto_skcipher *mk_direct_tfms[__FSCRYPT_MODE_MAX + 1];
-
/*
- * Crypto API transforms for filesystem-layer implementation of
- * IV_INO_LBLK_64 policies, allocated on-demand.
+ * Per-mode encryption keys for the various types of encryption policies
+ * that use them. Allocated and derived on-demand.
*/
- struct crypto_skcipher *mk_iv_ino_lblk_64_tfms[__FSCRYPT_MODE_MAX + 1];
+ struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1];
+ struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1];
+ struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1];
+
+ /* Hash key for inode numbers. Initialized only when needed. */
+ siphash_key_t mk_ino_hash_key;
+ bool mk_ino_hash_key_initialized;
} __randomize_layout;
@@ -436,14 +441,17 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
return 0;
}
-extern struct key *
+struct key *
fscrypt_find_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec);
-extern int fscrypt_verify_key_added(struct super_block *sb,
- const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
+int fscrypt_add_test_dummy_key(struct super_block *sb,
+ struct fscrypt_key_specifier *key_spec);
+
+int fscrypt_verify_key_added(struct super_block *sb,
+ const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
-extern int __init fscrypt_init_keyring(void);
+int __init fscrypt_init_keyring(void);
/* keysetup.c */
@@ -457,33 +465,32 @@ struct fscrypt_mode {
extern struct fscrypt_mode fscrypt_modes[];
-extern struct crypto_skcipher *
-fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
- const struct inode *inode);
+struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
+ const u8 *raw_key,
+ const struct inode *inode);
-extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci,
- const u8 *raw_key);
+int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
-extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
- const struct fscrypt_master_key *mk);
+int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk);
/* keysetup_v1.c */
-extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
+void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
+
+int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+ const u8 *raw_master_key);
-extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
- const u8 *raw_master_key);
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci);
-extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
- struct fscrypt_info *ci);
/* policy.c */
-extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
- const union fscrypt_policy *policy2);
-extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
- const struct inode *inode);
-extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
- const union fscrypt_context *ctx_u,
- int ctx_size);
+bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
+ const union fscrypt_policy *policy2);
+bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
+ const struct inode *inode);
+int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
+ const union fscrypt_context *ctx_u,
+ int ctx_size);
#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index efb95bd19a89..0cba7928446d 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -44,17 +44,13 @@ static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
unsigned int ikmlen, u8 prk[HKDF_HASHLEN])
{
static const u8 default_salt[HKDF_HASHLEN];
- SHASH_DESC_ON_STACK(desc, hmac_tfm);
int err;
err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN);
if (err)
return err;
- desc->tfm = hmac_tfm;
- err = crypto_shash_digest(desc, ikm, ikmlen, prk);
- shash_desc_zero(desc);
- return err;
+ return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk);
}
/*
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 5ef861742921..09fb8aa0f2e9 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -10,7 +10,7 @@
#include "fscrypt_private.h"
/**
- * fscrypt_file_open - prepare to open a possibly-encrypted regular file
+ * fscrypt_file_open() - prepare to open a possibly-encrypted regular file
* @inode: the inode being opened
* @filp: the struct file being set up
*
@@ -262,7 +262,7 @@ err_free_sd:
EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink);
/**
- * fscrypt_get_symlink - get the target of an encrypted symlink
+ * fscrypt_get_symlink() - get the target of an encrypted symlink
* @inode: the symlink inode
* @caddr: the on-disk contents of the symlink
* @max_size: size of @caddr buffer
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index ab41b25d4fa1..e24eb48bfbe1 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -20,6 +20,7 @@
#include <crypto/skcipher.h>
#include <linux/key-type.h>
+#include <linux/random.h>
#include <linux/seq_file.h>
#include "fscrypt_private.h"
@@ -44,8 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk)
wipe_master_key_secret(&mk->mk_secret);
for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) {
- crypto_free_skcipher(mk->mk_direct_tfms[i]);
- crypto_free_skcipher(mk->mk_iv_ino_lblk_64_tfms[i]);
+ crypto_free_skcipher(mk->mk_direct_keys[i]);
+ crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]);
+ crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]);
}
key_put(mk->mk_users);
@@ -424,9 +426,9 @@ static int add_existing_master_key(struct fscrypt_master_key *mk,
return 0;
}
-static int add_master_key(struct super_block *sb,
- struct fscrypt_master_key_secret *secret,
- const struct fscrypt_key_specifier *mk_spec)
+static int do_add_master_key(struct super_block *sb,
+ struct fscrypt_master_key_secret *secret,
+ const struct fscrypt_key_specifier *mk_spec)
{
static DEFINE_MUTEX(fscrypt_add_key_mutex);
struct key *key;
@@ -465,6 +467,35 @@ out_unlock:
return err;
}
+static int add_master_key(struct super_block *sb,
+ struct fscrypt_master_key_secret *secret,
+ struct fscrypt_key_specifier *key_spec)
+{
+ int err;
+
+ if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
+ err = fscrypt_init_hkdf(&secret->hkdf, secret->raw,
+ secret->size);
+ if (err)
+ return err;
+
+ /*
+ * Now that the HKDF context is initialized, the raw key is no
+ * longer needed.
+ */
+ memzero_explicit(secret->raw, secret->size);
+
+ /* Calculate the key identifier */
+ err = fscrypt_hkdf_expand(&secret->hkdf,
+ HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0,
+ key_spec->u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
+ if (err)
+ return err;
+ }
+ return do_add_master_key(sb, secret, key_spec);
+}
+
static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep)
{
const struct fscrypt_provisioning_key_payload *payload = prep->data;
@@ -609,6 +640,15 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg)
if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
return -EINVAL;
+ /*
+ * Only root can add keys that are identified by an arbitrary descriptor
+ * rather than by a cryptographic hash --- since otherwise a malicious
+ * user could add the wrong key.
+ */
+ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
+ !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
memset(&secret, 0, sizeof(secret));
if (arg.key_id) {
if (arg.raw_size != 0)
@@ -626,48 +666,17 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg)
goto out_wipe_secret;
}
- switch (arg.key_spec.type) {
- case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
- /*
- * Only root can add keys that are identified by an arbitrary
- * descriptor rather than by a cryptographic hash --- since
- * otherwise a malicious user could add the wrong key.
- */
- err = -EACCES;
- if (!capable(CAP_SYS_ADMIN))
- goto out_wipe_secret;
- break;
- case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
- err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
- if (err)
- goto out_wipe_secret;
-
- /*
- * Now that the HKDF context is initialized, the raw key is no
- * longer needed.
- */
- memzero_explicit(secret.raw, secret.size);
-
- /* Calculate the key identifier and return it to userspace. */
- err = fscrypt_hkdf_expand(&secret.hkdf,
- HKDF_CONTEXT_KEY_IDENTIFIER,
- NULL, 0, arg.key_spec.u.identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
- if (err)
- goto out_wipe_secret;
- err = -EFAULT;
- if (copy_to_user(uarg->key_spec.u.identifier,
- arg.key_spec.u.identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE))
- goto out_wipe_secret;
- break;
- default:
- WARN_ON(1);
- err = -EINVAL;
+ err = add_master_key(sb, &secret, &arg.key_spec);
+ if (err)
goto out_wipe_secret;
- }
- err = add_master_key(sb, &secret, &arg.key_spec);
+ /* Return the key identifier to userspace, if applicable */
+ err = -EFAULT;
+ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER &&
+ copy_to_user(uarg->key_spec.u.identifier, arg.key_spec.u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE))
+ goto out_wipe_secret;
+ err = 0;
out_wipe_secret:
wipe_master_key_secret(&secret);
return err;
@@ -675,6 +684,29 @@ out_wipe_secret:
EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);
/*
+ * Add the key for '-o test_dummy_encryption' to the filesystem keyring.
+ *
+ * Use a per-boot random key to prevent people from misusing this option.
+ */
+int fscrypt_add_test_dummy_key(struct super_block *sb,
+ struct fscrypt_key_specifier *key_spec)
+{
+ static u8 test_key[FSCRYPT_MAX_KEY_SIZE];
+ struct fscrypt_master_key_secret secret;
+ int err;
+
+ get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE);
+
+ memset(&secret, 0, sizeof(secret));
+ secret.size = FSCRYPT_MAX_KEY_SIZE;
+ memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE);
+
+ err = add_master_key(sb, &secret, key_spec);
+ wipe_master_key_secret(&secret);
+ return err;
+}
+
+/*
* Verify that the current user has added a master key with the given identifier
* (returns -ENOKEY if not). This is needed to prevent a user from encrypting
* their files using some other user's key which they don't actually know.
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 302375e9f719..1129adfa097d 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -46,6 +46,8 @@ struct fscrypt_mode fscrypt_modes[] = {
},
};
+static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex);
+
static struct fscrypt_mode *
select_encryption_mode(const union fscrypt_policy *policy,
const struct inode *inode)
@@ -130,7 +132,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci,
const struct super_block *sb = inode->i_sb;
struct fscrypt_mode *mode = ci->ci_mode;
const u8 mode_num = mode - fscrypt_modes;
- struct crypto_skcipher *tfm, *prev_tfm;
+ struct crypto_skcipher *tfm;
u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)];
unsigned int hkdf_infolen = 0;
@@ -139,10 +141,17 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci,
if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX))
return -EINVAL;
- /* pairs with cmpxchg() below */
+ /* pairs with smp_store_release() below */
tfm = READ_ONCE(tfms[mode_num]);
- if (likely(tfm != NULL))
- goto done;
+ if (likely(tfm != NULL)) {
+ ci->ci_ctfm = tfm;
+ return 0;
+ }
+
+ mutex_lock(&fscrypt_mode_key_setup_mutex);
+
+ if (tfms[mode_num])
+ goto done_unlock;
BUILD_BUG_ON(sizeof(mode_num) != 1);
BUILD_BUG_ON(sizeof(sb->s_uuid) != 16);
@@ -157,21 +166,21 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci,
hkdf_context, hkdf_info, hkdf_infolen,
mode_key, mode->keysize);
if (err)
- return err;
+ goto out_unlock;
tfm = fscrypt_allocate_skcipher(mode, mode_key, inode);
memzero_explicit(mode_key, mode->keysize);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- /* pairs with READ_ONCE() above */
- prev_tfm = cmpxchg(&tfms[mode_num], NULL, tfm);
- if (prev_tfm != NULL) {
- crypto_free_skcipher(tfm);
- tfm = prev_tfm;
+ if (IS_ERR(tfm)) {
+ err = PTR_ERR(tfm);
+ goto out_unlock;
}
-done:
+ /* pairs with READ_ONCE() above */
+ smp_store_release(&tfms[mode_num], tfm);
+done_unlock:
ci->ci_ctfm = tfm;
- return 0;
+ err = 0;
+out_unlock:
+ mutex_unlock(&fscrypt_mode_key_setup_mutex);
+ return err;
}
int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
@@ -189,6 +198,43 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
return 0;
}
+static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
+ struct fscrypt_master_key *mk)
+{
+ int err;
+
+ err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys,
+ HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true);
+ if (err)
+ return err;
+
+ /* pairs with smp_store_release() below */
+ if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) {
+
+ mutex_lock(&fscrypt_mode_key_setup_mutex);
+
+ if (mk->mk_ino_hash_key_initialized)
+ goto unlock;
+
+ err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+ HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0,
+ (u8 *)&mk->mk_ino_hash_key,
+ sizeof(mk->mk_ino_hash_key));
+ if (err)
+ goto unlock;
+ /* pairs with smp_load_acquire() above */
+ smp_store_release(&mk->mk_ino_hash_key_initialized, true);
+unlock:
+ mutex_unlock(&fscrypt_mode_key_setup_mutex);
+ if (err)
+ return err;
+ }
+
+ ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
+ &mk->mk_ino_hash_key);
+ return 0;
+}
+
static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
struct fscrypt_master_key *mk)
{
@@ -203,7 +249,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* encryption key. This ensures that the master key is
* consistently used only for HKDF, avoiding key reuse issues.
*/
- err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms,
+ err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys,
HKDF_CONTEXT_DIRECT_KEY, false);
} else if (ci->ci_policy.v2.flags &
FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
@@ -211,11 +257,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* IV_INO_LBLK_64: encryption keys are derived from (master_key,
* mode_num, filesystem_uuid), and inode number is included in
* the IVs. This format is optimized for use with inline
- * encryption hardware compliant with the UFS or eMMC standards.
+ * encryption hardware compliant with the UFS standard.
*/
- err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms,
+ err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys,
HKDF_CONTEXT_IV_INO_LBLK_64_KEY,
true);
+ } else if (ci->ci_policy.v2.flags &
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
+ err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk);
} else {
u8 derived_key[FSCRYPT_MAX_KEY_SIZE];
@@ -395,21 +444,18 @@ int fscrypt_get_encryption_info(struct inode *inode)
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
if (res < 0) {
- if (!fscrypt_dummy_context_enabled(inode) ||
- IS_ENCRYPTED(inode)) {
+ const union fscrypt_context *dummy_ctx =
+ fscrypt_get_dummy_context(inode->i_sb);
+
+ if (IS_ENCRYPTED(inode) || !dummy_ctx) {
fscrypt_warn(inode,
"Error %d getting encryption context",
res);
return res;
}
/* Fake up a context for an unencrypted directory */
- memset(&ctx, 0, sizeof(ctx));
- ctx.version = FSCRYPT_CONTEXT_V1;
- ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memset(ctx.v1.master_key_descriptor, 0x42,
- FSCRYPT_KEY_DESCRIPTOR_SIZE);
- res = sizeof(ctx.v1);
+ res = fscrypt_context_size(dummy_ctx);
+ memcpy(&ctx, dummy_ctx, res);
}
crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
@@ -475,7 +521,8 @@ out:
EXPORT_SYMBOL(fscrypt_get_encryption_info);
/**
- * fscrypt_put_encryption_info - free most of an inode's fscrypt data
+ * fscrypt_put_encryption_info() - free most of an inode's fscrypt data
+ * @inode: an inode being evicted
*
* Free the inode's fscrypt_info. Filesystems must call this when the inode is
* being evicted. An RCU grace period need not have elapsed yet.
@@ -488,7 +535,8 @@ void fscrypt_put_encryption_info(struct inode *inode)
EXPORT_SYMBOL(fscrypt_put_encryption_info);
/**
- * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay
+ * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay
+ * @inode: an inode being freed
*
* Free the inode's cached decrypted symlink target, if any. Filesystems must
* call this after an RCU grace period, just before they free the inode.
@@ -503,7 +551,8 @@ void fscrypt_free_inode(struct inode *inode)
EXPORT_SYMBOL(fscrypt_free_inode);
/**
- * fscrypt_drop_inode - check whether the inode's master key has been removed
+ * fscrypt_drop_inode() - check whether the inode's master key has been removed
+ * @inode: an inode being considered for eviction
*
* Filesystems supporting fscrypt must call this from their ->drop_inode()
* method so that encrypted inodes are evicted as soon as they're no longer in
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 10ccf945020c..d23ff162c78b 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -11,12 +11,15 @@
*/
#include <linux/random.h>
+#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/mount.h>
#include "fscrypt_private.h"
/**
- * fscrypt_policies_equal - check whether two encryption policies are the same
+ * fscrypt_policies_equal() - check whether two encryption policies are the same
+ * @policy1: the first policy
+ * @policy2: the second policy
*
* Return: %true if equal, else %false
*/
@@ -66,18 +69,14 @@ static bool supported_direct_key_modes(const struct inode *inode,
return true;
}
-static bool supported_iv_ino_lblk_64_policy(
- const struct fscrypt_policy_v2 *policy,
- const struct inode *inode)
+static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
+ const struct inode *inode,
+ const char *type,
+ int max_ino_bits, int max_lblk_bits)
{
struct super_block *sb = inode->i_sb;
int ino_bits = 64, lblk_bits = 64;
- if (policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
- fscrypt_warn(inode,
- "The DIRECT_KEY and IV_INO_LBLK_64 flags are mutually exclusive");
- return false;
- }
/*
* It's unsafe to include inode numbers in the IVs if the filesystem can
* potentially renumber inodes, e.g. via filesystem shrinking.
@@ -85,16 +84,22 @@ static bool supported_iv_ino_lblk_64_policy(
if (!sb->s_cop->has_stable_inodes ||
!sb->s_cop->has_stable_inodes(sb)) {
fscrypt_warn(inode,
- "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't have stable inode numbers",
- sb->s_id);
+ "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers",
+ type, sb->s_id);
return false;
}
if (sb->s_cop->get_ino_and_lblk_bits)
sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
- if (ino_bits > 32 || lblk_bits > 32) {
+ if (ino_bits > max_ino_bits) {
+ fscrypt_warn(inode,
+ "Can't use %s policy on filesystem '%s' because its inode numbers are too long",
+ type, sb->s_id);
+ return false;
+ }
+ if (lblk_bits > max_lblk_bits) {
fscrypt_warn(inode,
- "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't use 32-bit inode and block numbers",
- sb->s_id);
+ "Can't use %s policy on filesystem '%s' because its block numbers are too long",
+ type, sb->s_id);
return false;
}
return true;
@@ -137,6 +142,8 @@ static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy,
static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
const struct inode *inode)
{
+ int count = 0;
+
if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
policy->filenames_encryption_mode)) {
fscrypt_warn(inode,
@@ -152,13 +159,29 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
return false;
}
+ count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY);
+ count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64);
+ count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32);
+ if (count > 1) {
+ fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)",
+ policy->flags);
+ return false;
+ }
+
if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
!supported_direct_key_modes(inode, policy->contents_encryption_mode,
policy->filenames_encryption_mode))
return false;
if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
- !supported_iv_ino_lblk_64_policy(policy, inode))
+ !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64",
+ 32, 32))
+ return false;
+
+ if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
+ /* This uses hashed inode numbers, so ino_bits doesn't matter. */
+ !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
+ INT_MAX, 32))
return false;
if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -170,7 +193,9 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
}
/**
- * fscrypt_supported_policy - check whether an encryption policy is supported
+ * fscrypt_supported_policy() - check whether an encryption policy is supported
+ * @policy_u: the encryption policy
+ * @inode: the inode on which the policy will be used
*
* Given an encryption policy, check whether all its encryption modes and other
* settings are supported by this kernel on the given inode. (But we don't
@@ -192,7 +217,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
}
/**
- * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy
+ * fscrypt_new_context_from_policy() - create a new fscrypt_context from
+ * an fscrypt_policy
+ * @ctx_u: output context
+ * @policy_u: input policy
*
* Create an fscrypt_context for an inode that is being assigned the given
* encryption policy. A new nonce is randomly generated.
@@ -242,7 +270,11 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
}
/**
- * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy
+ * fscrypt_policy_from_context() - convert an fscrypt_context to
+ * an fscrypt_policy
+ * @policy_u: output policy
+ * @ctx_u: input context
+ * @ctx_size: size of input context in bytes
*
* Given an fscrypt_context, build the corresponding fscrypt_policy.
*
@@ -354,6 +386,9 @@ static int set_encryption_policy(struct inode *inode,
policy->v2.master_key_identifier);
if (err)
return err;
+ if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
+ pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n",
+ current->comm, current->pid);
break;
default:
WARN_ON(1);
@@ -605,3 +640,127 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
return preload ? fscrypt_get_encryption_info(child): 0;
}
EXPORT_SYMBOL(fscrypt_inherit_context);
+
+/**
+ * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption'
+ * @sb: the filesystem on which test_dummy_encryption is being specified
+ * @arg: the argument to the test_dummy_encryption option.
+ * If no argument was specified, then @arg->from == NULL.
+ * @dummy_ctx: the filesystem's current dummy context (input/output, see below)
+ *
+ * Handle the test_dummy_encryption mount option by creating a dummy encryption
+ * context, saving it in @dummy_ctx, and adding the corresponding dummy
+ * encryption key to the filesystem. If the @dummy_ctx is already set, then
+ * instead validate that it matches @arg. Don't support changing it via
+ * remount, as that is difficult to do safely.
+ *
+ * The reason we use an fscrypt_context rather than an fscrypt_policy is because
+ * we mustn't generate a new nonce each time we access a dummy-encrypted
+ * directory, as that would change the way filenames are encrypted.
+ *
+ * Return: 0 on success (dummy context set, or the same context is already set);
+ * -EEXIST if a different dummy context is already set;
+ * or another -errno value.
+ */
+int fscrypt_set_test_dummy_encryption(struct super_block *sb,
+ const substring_t *arg,
+ struct fscrypt_dummy_context *dummy_ctx)
+{
+ const char *argstr = "v2";
+ const char *argstr_to_free = NULL;
+ struct fscrypt_key_specifier key_spec = { 0 };
+ int version;
+ union fscrypt_context *ctx = NULL;
+ int err;
+
+ if (arg->from) {
+ argstr = argstr_to_free = match_strdup(arg);
+ if (!argstr)
+ return -ENOMEM;
+ }
+
+ if (!strcmp(argstr, "v1")) {
+ version = FSCRYPT_CONTEXT_V1;
+ key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
+ memset(key_spec.u.descriptor, 0x42,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
+ } else if (!strcmp(argstr, "v2")) {
+ version = FSCRYPT_CONTEXT_V2;
+ key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ /* key_spec.u.identifier gets filled in when adding the key */
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (dummy_ctx->ctx) {
+ /*
+ * Note: if we ever make test_dummy_encryption support
+ * specifying other encryption settings, such as the encryption
+ * modes, we'll need to compare those settings here.
+ */
+ if (dummy_ctx->ctx->version == version)
+ err = 0;
+ else
+ err = -EEXIST;
+ goto out;
+ }
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = fscrypt_add_test_dummy_key(sb, &key_spec);
+ if (err)
+ goto out;
+
+ ctx->version = version;
+ switch (ctx->version) {
+ case FSCRYPT_CONTEXT_V1:
+ ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
+ break;
+ case FSCRYPT_CONTEXT_V2:
+ ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
+ break;
+ default:
+ WARN_ON(1);
+ err = -EINVAL;
+ goto out;
+ }
+ dummy_ctx->ctx = ctx;
+ ctx = NULL;
+ err = 0;
+out:
+ kfree(ctx);
+ kfree(argstr_to_free);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
+
+/**
+ * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption'
+ * @seq: the seq_file to print the option to
+ * @sep: the separator character to use
+ * @sb: the filesystem whose options are being shown
+ *
+ * Show the test_dummy_encryption mount option, if it was specified.
+ * This is mainly used for /proc/mounts.
+ */
+void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
+ struct super_block *sb)
+{
+ const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb);
+
+ if (!ctx)
+ return;
+ seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version);
+}
+EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 2d357680094c..ae49a55bda00 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -506,20 +506,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_u32(const char *name, umode_t mode,
- struct dentry *parent, u32 *value)
+void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent,
+ u32 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32,
&fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2c449aed1b92..0681540c48d9 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -48,18 +48,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size)
}
}
-static int ecryptfs_hash_digest(struct crypto_shash *tfm,
- char *src, int len, char *dst)
-{
- SHASH_DESC_ON_STACK(desc, tfm);
- int err;
-
- desc->tfm = tfm;
- err = crypto_shash_digest(desc, src, len, dst);
- shash_desc_zero(desc);
- return err;
-}
-
/**
* ecryptfs_calculate_md5 - calculates the md5 of @src
* @dst: Pointer to 16 bytes of allocated memory
@@ -74,11 +62,8 @@ static int ecryptfs_calculate_md5(char *dst,
struct ecryptfs_crypt_stat *crypt_stat,
char *src, int len)
{
- struct crypto_shash *tfm;
- int rc = 0;
+ int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst);
- tfm = crypt_stat->hash_tfm;
- rc = ecryptfs_hash_digest(tfm, src, len, dst);
if (rc) {
printk(KERN_ERR
"%s: Error computing crypto hash; rc = [%d]\n",
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8c596641a72b..12eebcdea9c8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1171,6 +1171,10 @@ static inline bool chain_epi_lockless(struct epitem *epi)
{
struct eventpoll *ep = epi->ep;
+ /* Fast preliminary check */
+ if (epi->next != EP_UNACTIVE_PTR)
+ return false;
+
/* Check that the same epi has not been just chained from another CPU */
if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
return false;
@@ -1237,16 +1241,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR &&
- chain_epi_lockless(epi))
+ if (chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
+ } else if (!ep_is_linked(epi)) {
+ /* In the usual case, add event to ready list. */
+ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
ep_pm_stay_awake_rcu(epi);
- goto out_unlock;
- }
-
- /* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi) &&
- list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
- ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
- bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@@ -1867,55 +1866,75 @@ fetch_events:
*/
ep_reset_busy_poll_napi_id(ep);
- /*
- * We don't have any available event to return to the caller. We need
- * to sleep here, and we will be woken by ep_poll_callback() when events
- * become available.
- */
- if (!waiter) {
- waiter = true;
- init_waitqueue_entry(&wait, current);
+ do {
+ /*
+ * Internally init_wait() uses autoremove_wake_function(),
+ * thus wait entry is removed from the wait queue on each
+ * wakeup. Why it is important? In case of several waiters
+ * each new wakeup will hit the next waiter, giving it the
+ * chance to harvest new event. Otherwise wakeup can be
+ * lost. This is also good performance-wise, because on
+ * normal wakeup path no need to call __remove_wait_queue()
+ * explicitly, thus ep->lock is not taken, which halts the
+ * event delivery.
+ */
+ init_wait(&wait);
write_lock_irq(&ep->lock);
- __add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
- }
-
- for (;;) {
/*
- * We don't want to sleep if the ep_poll_callback() sends us
- * a wakeup in between. That's why we set the task state
- * to TASK_INTERRUPTIBLE before doing the checks.
+ * Barrierless variant, waitqueue_active() is called under
+ * the same lock on wakeup ep_poll_callback() side, so it
+ * is safe to avoid an explicit barrier.
*/
- set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
/*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
+ * Do the final check under the lock. ep_scan_ready_list()
+ * plays with two lists (->rdllist and ->ovflist) and there
+ * is always a race when both lists are empty for short
+ * period of time although events are pending, so lock is
+ * important.
*/
- if (fatal_signal_pending(current)) {
- res = -EINTR;
- break;
+ eavail = ep_events_available(ep);
+ if (!eavail) {
+ if (signal_pending(current))
+ res = -EINTR;
+ else
+ __add_wait_queue_exclusive(&ep->wq, &wait);
}
+ write_unlock_irq(&ep->lock);
- eavail = ep_events_available(ep);
- if (eavail)
+ if (eavail || res)
break;
- if (signal_pending(current)) {
- res = -EINTR;
- break;
- }
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
timed_out = 1;
break;
}
- }
+
+ /* We were woken up, thus go and try to harvest some events */
+ eavail = 1;
+
+ } while (0);
__set_current_state(TASK_RUNNING);
+ if (!list_empty_careful(&wait.entry)) {
+ write_lock_irq(&ep->lock);
+ __remove_wait_queue(&ep->wq, &wait);
+ write_unlock_irq(&ep->lock);
+ }
+
send_events:
+ if (fatal_signal_pending(current)) {
+ /*
+ * Always short-circuit for fatal signals to allow
+ * threads to make a timely exit without the chance of
+ * finding more events available and fetching
+ * repeatedly.
+ */
+ res = -EINTR;
+ }
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
@@ -1925,12 +1944,6 @@ send_events:
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
- if (waiter) {
- write_lock_irq(&ep->lock);
- __remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
- }
-
return res;
}
diff --git a/fs/exec.c b/fs/exec.c
index 06b4c550af5d..2c465119affc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1317,6 +1317,8 @@ int flush_old_exec(struct linux_binprm * bprm)
*/
set_mm_exe_file(bprm->mm, bprm->file);
+ would_dump(bprm, bprm->file);
+
/*
* Release all of the old mmap stuff
*/
@@ -1876,8 +1878,6 @@ static int __do_execve_file(int fd, struct filename *filename,
if (retval < 0)
goto out;
- would_dump(bprm, bprm->file);
-
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 6a04cc02565a..6774a5a6ded8 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -91,7 +91,6 @@ static int exfat_allocate_bitmap(struct super_block *sb,
}
}
- sbi->pbr_bh = NULL;
return 0;
}
@@ -137,8 +136,6 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi)
{
int i;
- brelse(sbi->pbr_bh);
-
for (i = 0; i < sbi->map_sectors; i++)
__brelse(sbi->vol_amap[i]);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 67d4e46fb810..d67fb8a6f770 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -507,6 +507,7 @@ void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...)
__printf(3, 4) __cold;
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 tz, __le16 time, __le16 date, u8 time_ms);
+void exfat_truncate_atime(struct timespec64 *ts);
void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 *tz, __le16 *time, __le16 *date, u8 *time_ms);
unsigned short exfat_calc_chksum_2byte(void *data, int len,
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 483f683757aa..c9db8eb0cfc3 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -273,6 +273,7 @@ int exfat_getattr(const struct path *path, struct kstat *stat,
struct exfat_inode_info *ei = EXFAT_I(inode);
generic_fillattr(inode, stat);
+ exfat_truncate_atime(&stat->atime);
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ei->i_crtime.tv_sec;
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
@@ -339,6 +340,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr)
}
setattr_copy(inode, attr);
+ exfat_truncate_atime(&inode->i_atime);
mark_inode_dirty(inode);
out:
@@ -346,12 +348,13 @@ out:
}
const struct file_operations exfat_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
- .fsync = generic_file_fsync,
- .splice_read = generic_file_splice_read,
+ .llseek = generic_file_llseek,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
+ .mmap = generic_file_mmap,
+ .fsync = generic_file_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations exfat_file_inode_operations = {
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 14a3300848f6..ebd2cbe3cbc1 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -88,7 +88,8 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
if (time_ms) {
ts->tv_sec += time_ms / 100;
ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC;
- }
+ } else
+ ts->tv_nsec = 0;
if (tz & EXFAT_TZ_VALID)
/* Adjust timezone to UTC0. */
@@ -124,6 +125,17 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
*tz = EXFAT_TZ_VALID;
}
+/*
+ * The timestamp for access_time has double seconds granularity.
+ * (There is no 10msIncrement field for access_time unlike create/modify_time)
+ * atime also has only a 2-second resolution.
+ */
+void exfat_truncate_atime(struct timespec64 *ts)
+{
+ ts->tv_sec = round_down(ts->tv_sec, 2);
+ ts->tv_nsec = 0;
+}
+
unsigned short exfat_calc_chksum_2byte(void *data, int len,
unsigned short chksum, int type)
{
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index a8681d91f569..a2659a8a68a1 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -595,6 +595,7 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode_inc_iversion(inode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
EXFAT_I(inode)->i_crtime = current_time(inode);
+ exfat_truncate_atime(&inode->i_atime);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -691,6 +692,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
exfat_fs_error(sb,
"non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)",
i_size_read(dir), ei->dir.dir, ei->entry);
+ kfree(es);
return -EIO;
}
@@ -854,6 +856,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
+ exfat_truncate_atime(&dir->i_atime);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -861,6 +864,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
inode->i_mtime = inode->i_atime = current_time(inode);
+ exfat_truncate_atime(&inode->i_atime);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
@@ -903,6 +907,7 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode_inc_iversion(inode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
EXFAT_I(inode)->i_crtime = current_time(inode);
+ exfat_truncate_atime(&inode->i_atime);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -1019,6 +1024,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
+ exfat_truncate_atime(&dir->i_atime);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -1027,6 +1033,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
inode->i_mtime = inode->i_atime = current_time(inode);
+ exfat_truncate_atime(&inode->i_atime);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
@@ -1387,6 +1394,7 @@ static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry,
inode_inc_iversion(new_dir);
new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime =
EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
+ exfat_truncate_atime(&new_dir->i_atime);
if (IS_DIRSYNC(new_dir))
exfat_sync_inode(new_dir);
else
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 16ed202ef527..a846ff555656 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -49,6 +49,7 @@ static void exfat_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
exfat_set_vol_flags(sb, VOL_CLEAN);
exfat_free_bitmap(sbi);
+ brelse(sbi->pbr_bh);
mutex_unlock(&sbi->s_lock);
call_rcu(&sbi->rcu, exfat_delayed_free);
@@ -100,7 +101,7 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct pbr64 *bpb;
+ struct pbr64 *bpb = (struct pbr64 *)sbi->pbr_bh->b_data;
bool sync = 0;
/* flags are not changed */
@@ -115,15 +116,6 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
if (sb_rdonly(sb))
return 0;
- if (!sbi->pbr_bh) {
- sbi->pbr_bh = sb_bread(sb, 0);
- if (!sbi->pbr_bh) {
- exfat_msg(sb, KERN_ERR, "failed to read boot sector");
- return -ENOMEM;
- }
- }
-
- bpb = (struct pbr64 *)sbi->pbr_bh->b_data;
bpb->bsx.vol_flags = cpu_to_le16(new_flag);
if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh))
@@ -159,7 +151,6 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",iocharset=utf8");
else if (sbi->nls_io)
seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
- seq_printf(m, ",bps=%ld", sb->s_blocksize);
if (opts->errors == EXFAT_ERRORS_CONT)
seq_puts(m, ",errors=continue");
else if (opts->errors == EXFAT_ERRORS_PANIC)
@@ -212,6 +203,12 @@ enum {
Opt_errors,
Opt_discard,
Opt_time_offset,
+
+ /* Deprecated options */
+ Opt_utf8,
+ Opt_debug,
+ Opt_namecase,
+ Opt_codepage,
};
static const struct constant_table exfat_param_enums[] = {
@@ -232,6 +229,14 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_enum("errors", Opt_errors, exfat_param_enums),
fsparam_flag("discard", Opt_discard),
fsparam_s32("time_offset", Opt_time_offset),
+ __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
+ NULL),
+ __fsparam(NULL, "debug", Opt_debug, fs_param_deprecated,
+ NULL),
+ __fsparam(fs_param_is_u32, "namecase", Opt_namecase,
+ fs_param_deprecated, NULL),
+ __fsparam(fs_param_is_u32, "codepage", Opt_codepage,
+ fs_param_deprecated, NULL),
{}
};
@@ -287,6 +292,11 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
opts->time_offset = result.int_32;
break;
+ case Opt_utf8:
+ case Opt_debug:
+ case Opt_namecase:
+ case Opt_codepage:
+ break;
default:
return -EINVAL;
}
@@ -351,14 +361,15 @@ static int exfat_read_root(struct inode *inode)
exfat_save_attr(inode, ATTR_SUBDIR);
inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
current_time(inode);
+ exfat_truncate_atime(&inode->i_atime);
exfat_cache_init_inode(inode);
return 0;
}
-static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb,
- struct buffer_head **prev_bh)
+static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb)
{
- struct pbr *p_pbr = (struct pbr *) (*prev_bh)->b_data;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct pbr *p_pbr = (struct pbr *) (sbi->pbr_bh)->b_data;
unsigned short logical_sect = 0;
logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits;
@@ -378,26 +389,23 @@ static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb,
}
if (logical_sect > sb->s_blocksize) {
- struct buffer_head *bh = NULL;
-
- __brelse(*prev_bh);
- *prev_bh = NULL;
+ brelse(sbi->pbr_bh);
+ sbi->pbr_bh = NULL;
if (!sb_set_blocksize(sb, logical_sect)) {
exfat_msg(sb, KERN_ERR,
"unable to set blocksize %u", logical_sect);
return NULL;
}
- bh = sb_bread(sb, 0);
- if (!bh) {
+ sbi->pbr_bh = sb_bread(sb, 0);
+ if (!sbi->pbr_bh) {
exfat_msg(sb, KERN_ERR,
"unable to read boot sector (logical sector size = %lu)",
sb->s_blocksize);
return NULL;
}
- *prev_bh = bh;
- p_pbr = (struct pbr *) bh->b_data;
+ p_pbr = (struct pbr *)sbi->pbr_bh->b_data;
}
return p_pbr;
}
@@ -408,21 +416,20 @@ static int __exfat_fill_super(struct super_block *sb)
int ret;
struct pbr *p_pbr;
struct pbr64 *p_bpb;
- struct buffer_head *bh;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
/* set block size to read super block */
sb_min_blocksize(sb, 512);
/* read boot sector */
- bh = sb_bread(sb, 0);
- if (!bh) {
+ sbi->pbr_bh = sb_bread(sb, 0);
+ if (!sbi->pbr_bh) {
exfat_msg(sb, KERN_ERR, "unable to read boot sector");
return -EIO;
}
/* PRB is read */
- p_pbr = (struct pbr *)bh->b_data;
+ p_pbr = (struct pbr *)sbi->pbr_bh->b_data;
/* check the validity of PBR */
if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) {
@@ -433,7 +440,7 @@ static int __exfat_fill_super(struct super_block *sb)
/* check logical sector size */
- p_pbr = exfat_read_pbr_with_logical_sector(sb, &bh);
+ p_pbr = exfat_read_pbr_with_logical_sector(sb);
if (!p_pbr) {
ret = -EIO;
goto free_bh;
@@ -514,7 +521,7 @@ free_alloc_bitmap:
free_upcase_table:
exfat_free_upcase_table(sbi);
free_bh:
- brelse(bh);
+ brelse(sbi->pbr_bh);
return ret;
}
@@ -531,17 +538,18 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
if (opts->discard) {
struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
+ if (!blk_queue_discard(q)) {
exfat_msg(sb, KERN_WARNING,
"mounting with \"discard\" option, but the device does not support discard");
- opts->discard = 0;
+ opts->discard = 0;
+ }
}
sb->s_flags |= SB_NODIRATIME;
sb->s_magic = EXFAT_SUPER_MAGIC;
sb->s_op = &exfat_sops;
- sb->s_time_gran = 1;
+ sb->s_time_gran = 10 * NSEC_PER_MSEC;
sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
@@ -605,6 +613,7 @@ put_inode:
free_table:
exfat_free_upcase_table(sbi);
exfat_free_bitmap(sbi);
+ brelse(sbi->pbr_bh);
check_nls_io:
unload_nls(sbi->nls_io);
@@ -717,6 +726,7 @@ static void __exit exit_exfat_fs(void)
module_init(init_exfat_fs);
module_exit(exit_exfat_fs);
+MODULE_ALIAS_FS("exfat");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("exFAT filesystem support");
MODULE_AUTHOR("Samsung Electronics Co., Ltd.");
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0e0a4d6209c7..a32e5f7b5385 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -410,7 +410,7 @@ verified:
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
*
- * Return buffer_head on success or NULL in case of failure.
+ * Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
@@ -502,7 +502,7 @@ out:
return ERR_PTR(err);
}
-/* Returns 0 on success, 1 on error */
+/* Returns 0 on success, -errno on error */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 91eb4381cae5..3147bb0cf82a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -722,7 +722,7 @@ enum {
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
/* Max logical block we can support */
-#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF
+#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE
/*
* Structure of an inode on the disk
@@ -1357,11 +1357,9 @@ struct ext4_super_block {
*/
#define EXT4_MF_MNTDIR_SAMPLED 0x0001
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
-#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
- EXT4_MF_TEST_DUMMY_ENCRYPTION))
+#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL)
#else
#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
#endif
@@ -1551,6 +1549,9 @@ struct ext4_sb_info {
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+ /* Encryption context for '-o test_dummy_encryption' */
+ struct fscrypt_dummy_context s_dummy_enc_ctx;
+
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
* or EXTENTS flag.
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7f16e1af8d5c..0c76cdd44d90 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -338,9 +338,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
- struct ext4_super_block *es;
-
- es = EXT4_SB(inode->i_sb)->s_es;
ext4_error_inode_err(inode, where, line,
bh->b_blocknr, EIO,
"IO error syncing itable block");
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 031752cfb6f7..2b4b94542e34 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3374,8 +3374,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
(unsigned long long)map->m_lblk, map_len);
sbi = EXT4_SB(inode->i_sb);
- eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
- inode->i_sb->s_blocksize_bits;
+ eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+ >> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map_len)
eof_block = map->m_lblk + map_len;
@@ -3627,8 +3627,8 @@ static int ext4_split_convert_extents(handle_t *handle,
__func__, inode->i_ino,
(unsigned long long)map->m_lblk, map->m_len);
- eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
- inode->i_sb->s_blocksize_bits;
+ eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+ >> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
/*
@@ -4832,6 +4832,28 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
.iomap_begin = ext4_iomap_xattr_begin,
};
+static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
+{
+ u64 maxbytes;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ maxbytes = inode->i_sb->s_maxbytes;
+ else
+ maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+
+ if (*len == 0)
+ return -EINVAL;
+ if (start > maxbytes)
+ return -EFBIG;
+
+ /*
+ * Shrink request scope to what the fs can actually handle.
+ */
+ if (*len > maxbytes || (maxbytes - *len) < start)
+ *len = maxbytes - start;
+ return 0;
+}
+
static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, bool from_es_cache)
{
@@ -4852,6 +4874,15 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
return -EBADR;
+ /*
+ * For bitmap files the maximum size limit could be smaller than
+ * s_maxbytes, so check len here manually instead of just relying on the
+ * generic check.
+ */
+ error = ext4_fiemap_check_ranges(inode, start, &len);
+ if (error)
+ return error;
+
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
error = iomap_fiemap(inode, fieinfo, start, len,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b420c9dc444d..4b8c9a9bdf0c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -113,7 +113,7 @@ verified:
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
*
- * Return buffer_head of bitmap on success or NULL.
+ * Return buffer_head of bitmap on success, or an ERR_PTR on error.
*/
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
@@ -662,7 +662,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* block has been written back to disk. (Yes, these values are
* somewhat arbitrary...)
*/
-#define RECENTCY_MIN 5
+#define RECENTCY_MIN 60
#define RECENTCY_DIRTY 300
static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e416096fc081..2a4aae6acdcb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1973,7 +1973,7 @@ static int ext4_writepage(struct page *page,
bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
- ext4_invalidatepage(page, 0, PAGE_SIZE);
+ inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return -EIO;
}
@@ -4364,7 +4364,7 @@ make_io:
if (end > table)
end = table;
while (b <= end)
- sb_breadahead(sb, b++);
+ sb_breadahead_unmovable(sb, b++);
}
/*
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bfc1281fc4cb..0746532ba463 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -733,29 +733,6 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
}
-/* copied from fs/ioctl.c */
-static int fiemap_check_ranges(struct super_block *sb,
- u64 start, u64 len, u64 *new_len)
-{
- u64 maxbytes = (u64) sb->s_maxbytes;
-
- *new_len = len;
-
- if (len == 0)
- return -EINVAL;
-
- if (start > maxbytes)
- return -EFBIG;
-
- /*
- * Shrink request scope to what the fs can actually handle.
- */
- if (len > maxbytes || (maxbytes - len) < start)
- *new_len = maxbytes - start;
-
- return 0;
-}
-
/* So that the fiemap access checks can't overflow on 32 bit machines. */
#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
@@ -765,8 +742,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
struct fiemap_extent_info fieinfo = { 0, };
struct inode *inode = file_inode(filp);
- struct super_block *sb = inode->i_sb;
- u64 len;
int error;
if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
@@ -775,11 +750,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
return -EINVAL;
- error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
- &len);
- if (error)
- return error;
-
fieinfo.fi_flags = fiemap.fm_flags;
fieinfo.fi_extents_max = fiemap.fm_extent_count;
fieinfo.fi_extents_start = ufiemap->fm_extents;
@@ -792,7 +762,8 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
filemap_write_and_wait(inode->i_mapping);
- error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len);
+ error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start,
+ fiemap.fm_length);
fiemap.fm_flags = fieinfo.fi_flags;
fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 87c85be4c12e..30d5d97548c4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1943,7 +1943,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
int free;
free = e4b->bd_info->bb_free;
- BUG_ON(free <= 0);
+ if (WARN_ON(free <= 0))
+ return;
i = e4b->bd_info->bb_first_free;
@@ -1966,7 +1967,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
}
mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
- BUG_ON(ex.fe_len <= 0);
+ if (WARN_ON(ex.fe_len <= 0))
+ break;
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9728e7b0e84f..4a3d21972011 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -596,7 +596,6 @@ void __ext4_error_file(struct file *file, const char *function,
{
va_list args;
struct va_format vaf;
- struct ext4_super_block *es;
struct inode *inode = file_inode(file);
char pathname[80], *path;
@@ -604,7 +603,6 @@ void __ext4_error_file(struct file *file, const char *function,
return;
trace_ext4_error(inode->i_sb, function, line);
- es = EXT4_SB(inode->i_sb)->s_es;
if (ext4_error_ratelimit(inode->i_sb)) {
path = file_path(file, pathname, sizeof(pathname));
if (IS_ERR(path))
@@ -1108,6 +1106,7 @@ static void ext4_put_super(struct super_block *sb)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
+ fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
#ifdef CONFIG_UNICODE
utf8_unload(sbi->s_encoding);
#endif
@@ -1391,9 +1390,10 @@ retry:
return res;
}
-static bool ext4_dummy_context(struct inode *inode)
+static const union fscrypt_context *
+ext4_get_dummy_context(struct super_block *sb)
{
- return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
+ return EXT4_SB(sb)->s_dummy_enc_ctx.ctx;
}
static bool ext4_has_stable_inodes(struct super_block *sb)
@@ -1412,7 +1412,7 @@ static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
- .dummy_context = ext4_dummy_context,
+ .get_dummy_context = ext4_get_dummy_context,
.empty_dir = ext4_empty_dir,
.max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
@@ -1607,6 +1607,7 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
@@ -1818,7 +1819,7 @@ static const struct mount_opts {
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
- {Opt_test_dummy_encryption, 0, MOPT_GTE0},
+ {Opt_test_dummy_encryption, 0, MOPT_STRING},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_err, 0, 0}
};
@@ -1853,6 +1854,48 @@ static int ext4_sb_read_encoding(const struct ext4_super_block *es,
}
#endif
+static int ext4_set_test_dummy_encryption(struct super_block *sb,
+ const char *opt,
+ const substring_t *arg,
+ bool is_remount)
+{
+#ifdef CONFIG_FS_ENCRYPTION
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (is_remount && !sbi->s_dummy_enc_ctx.ctx) {
+ ext4_msg(sb, KERN_WARNING,
+ "Can't set test_dummy_encryption on remount");
+ return -1;
+ }
+ err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx);
+ if (err) {
+ if (err == -EEXIST)
+ ext4_msg(sb, KERN_WARNING,
+ "Can't change test_dummy_encryption on remount");
+ else if (err == -EINVAL)
+ ext4_msg(sb, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized", opt);
+ else
+ ext4_msg(sb, KERN_WARNING,
+ "Error processing option \"%s\" [%d]",
+ opt, err);
+ return -1;
+ }
+ ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
+#else
+ ext4_msg(sb, KERN_WARNING,
+ "Test dummy encryption mount option ignored");
+#endif
+ return 1;
+}
+
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
substring_t *args, unsigned long *journal_devnum,
unsigned int *journal_ioprio, int is_remount)
@@ -2049,14 +2092,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_FS_ENCRYPTION
- sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mode enabled");
-#else
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mount option ignored");
-#endif
+ return ext4_set_test_dummy_encryption(sb, opt, &args[0],
+ is_remount);
} else if (m->flags & MOPT_DATAJ) {
if (is_remount) {
if (!sbi->s_journal)
@@ -2313,8 +2350,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
if (test_opt(sb, DATA_ERR_ABORT))
SEQ_OPTS_PUTS("data_err=abort");
- if (DUMMY_ENCRYPTION_ENABLED(sbi))
- SEQ_OPTS_PUTS("test_dummy_encryption");
+
+ fscrypt_show_test_dummy_encryption(seq, sep, sb);
ext4_show_quota_options(seq, sb);
return 0;
@@ -4340,7 +4377,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Pre-read the descriptors into the buffer cache */
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
- sb_breadahead(sb, block);
+ sb_breadahead_unmovable(sb, block);
}
for (i = 0; i < db_count; i++) {
@@ -4782,6 +4819,7 @@ failed_mount:
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
+ fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
ext4_blkdev_remove(sbi);
brelse(bh);
out_fail:
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 04bfaf63752c..6c9fc9e21c13 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -293,6 +293,7 @@ EXT4_ATTR_FEATURE(batched_discard);
EXT4_ATTR_FEATURE(meta_bg_resize);
#ifdef CONFIG_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
+EXT4_ATTR_FEATURE(test_dummy_encryption_v2);
#endif
#ifdef CONFIG_UNICODE
EXT4_ATTR_FEATURE(casefold);
@@ -308,6 +309,7 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(meta_bg_resize),
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
+ ATTR_LIST(test_dummy_encryption_v2),
#endif
#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ba470d5687fe..157eec348970 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -138,7 +138,7 @@ struct f2fs_mount_info {
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
- bool test_dummy_encryption; /* test dummy encryption */
+ struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
*/
@@ -1259,7 +1259,7 @@ enum fsync_mode {
#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) \
- (unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
+ (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL))
#else
#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
#endif
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 5bc4dcd8fc03..8c4ea5003ef8 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -12,7 +12,6 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
-#include <linux/cryptohash.h>
#include <linux/pagemap.h>
#include <linux/unicode.h>
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f2dfc21c6abb..8a9955902d84 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -202,6 +202,7 @@ static match_table_t f2fs_tokens = {
{Opt_whint, "whint_mode=%s"},
{Opt_alloc, "alloc_mode=%s"},
{Opt_fsync, "fsync_mode=%s"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_checkpoint_disable, "checkpoint=disable"},
{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
@@ -394,7 +395,52 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
}
#endif
-static int parse_options(struct super_block *sb, char *options)
+static int f2fs_set_test_dummy_encryption(struct super_block *sb,
+ const char *opt,
+ const substring_t *arg,
+ bool is_remount)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+#ifdef CONFIG_FS_ENCRYPTION
+ int err;
+
+ if (!f2fs_sb_has_encrypt(sbi)) {
+ f2fs_err(sbi, "Encrypt feature is off");
+ return -EINVAL;
+ }
+
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) {
+ f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+ err = fscrypt_set_test_dummy_encryption(
+ sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx);
+ if (err) {
+ if (err == -EEXIST)
+ f2fs_warn(sbi,
+ "Can't change test_dummy_encryption on remount");
+ else if (err == -EINVAL)
+ f2fs_warn(sbi, "Value of option \"%s\" is unrecognized",
+ opt);
+ else
+ f2fs_warn(sbi, "Error processing option \"%s\" [%d]",
+ opt, err);
+ return -EINVAL;
+ }
+ f2fs_warn(sbi, "Test dummy encryption mode enabled");
+#else
+ f2fs_warn(sbi, "Test dummy encryption mount option ignored");
+#endif
+ return 0;
+}
+
+static int parse_options(struct super_block *sb, char *options, bool is_remount)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
substring_t args[MAX_OPT_ARGS];
@@ -403,9 +449,7 @@ static int parse_options(struct super_block *sb, char *options)
int arg = 0, ext_cnt;
kuid_t uid;
kgid_t gid;
-#ifdef CONFIG_QUOTA
int ret;
-#endif
if (!options)
return 0;
@@ -778,17 +822,10 @@ static int parse_options(struct super_block *sb, char *options)
kvfree(name);
break;
case Opt_test_dummy_encryption:
-#ifdef CONFIG_FS_ENCRYPTION
- if (!f2fs_sb_has_encrypt(sbi)) {
- f2fs_err(sbi, "Encrypt feature is off");
- return -EINVAL;
- }
-
- F2FS_OPTION(sbi).test_dummy_encryption = true;
- f2fs_info(sbi, "Test dummy encryption mode enabled");
-#else
- f2fs_info(sbi, "Test dummy encryption mount option ignored");
-#endif
+ ret = f2fs_set_test_dummy_encryption(sb, p, &args[0],
+ is_remount);
+ if (ret)
+ return ret;
break;
case Opt_checkpoint_disable_cap_perc:
if (args->from && match_int(args, &arg))
@@ -1213,6 +1250,7 @@ static void f2fs_put_super(struct super_block *sb)
for (i = 0; i < MAXQUOTAS; i++)
kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
+ fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx);
destroy_percpu_info(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
@@ -1543,10 +1581,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",whint_mode=%s", "user-based");
else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
seq_printf(seq, ",whint_mode=%s", "fs-based");
-#ifdef CONFIG_FS_ENCRYPTION
- if (F2FS_OPTION(sbi).test_dummy_encryption)
- seq_puts(seq, ",test_dummy_encryption");
-#endif
+
+ fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb);
if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT)
seq_printf(seq, ",alloc_mode=%s", "default");
@@ -1575,7 +1611,6 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
- F2FS_OPTION(sbi).test_dummy_encryption = false;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4;
@@ -1734,7 +1769,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
default_options(sbi);
/* parse mount options */
- err = parse_options(sb, data);
+ err = parse_options(sb, data, true);
if (err)
goto restore_opts;
checkpoint_changed =
@@ -2410,9 +2445,10 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
ctx, len, fs_data, XATTR_CREATE);
}
-static bool f2fs_dummy_context(struct inode *inode)
+static const union fscrypt_context *
+f2fs_get_dummy_context(struct super_block *sb)
{
- return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode));
+ return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx;
}
static bool f2fs_has_stable_inodes(struct super_block *sb)
@@ -2431,7 +2467,7 @@ static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
- .dummy_context = f2fs_dummy_context,
+ .get_dummy_context = f2fs_get_dummy_context,
.empty_dir = f2fs_empty_dir,
.max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
@@ -3366,7 +3402,7 @@ try_onemore:
goto free_sb_buf;
}
- err = parse_options(sb, options);
+ err = parse_options(sb, options, false);
if (err)
goto free_options;
@@ -3769,6 +3805,7 @@ free_options:
for (i = 0; i < MAXQUOTAS; i++)
kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
+ fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx);
kvfree(options);
free_sb_buf:
kvfree(raw_super);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index e3bbbef9b4f0..3162f46b3c9b 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -446,6 +446,7 @@ enum feat_id {
FEAT_SB_CHECKSUM,
FEAT_CASEFOLD,
FEAT_COMPRESSION,
+ FEAT_TEST_DUMMY_ENCRYPTION_V2,
};
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
@@ -466,6 +467,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a,
case FEAT_SB_CHECKSUM:
case FEAT_CASEFOLD:
case FEAT_COMPRESSION:
+ case FEAT_TEST_DUMMY_ENCRYPTION_V2:
return sprintf(buf, "supported\n");
}
return 0;
@@ -563,6 +565,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks);
#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
+F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2);
#endif
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED);
@@ -647,6 +650,7 @@ ATTRIBUTE_GROUPS(f2fs);
static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
+ ATTR_LIST(test_dummy_encryption_v2),
#endif
#ifdef CONFIG_BLK_DEV_ZONED
ATTR_LIST(block_zoned),
diff --git a/fs/file.c b/fs/file.c
index c8a4e4c86e55..abb8b7081d7a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -70,7 +70,7 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
- unsigned int cpy, set;
+ size_t cpy, set;
BUG_ON(nfdt->max_fds < ofdt->max_fds);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 936a8ec6b48e..6306eaae378b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -528,10 +528,12 @@ lower_metapath:
/* Advance in metadata tree. */
(mp->mp_list[hgt])++;
- if (mp->mp_list[hgt] >= sdp->sd_inptrs) {
- if (!hgt)
+ if (hgt) {
+ if (mp->mp_list[hgt] >= sdp->sd_inptrs)
+ goto lower_metapath;
+ } else {
+ if (mp->mp_list[hgt] >= sdp->sd_diptrs)
break;
- goto lower_metapath;
}
fill_up_metapath:
@@ -876,10 +878,9 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
ret = -ENOENT;
goto unlock;
} else {
- /* report a hole */
iomap->offset = pos;
iomap->length = length;
- goto do_alloc;
+ goto hole_found;
}
}
iomap->length = size;
@@ -933,8 +934,6 @@ unlock:
return ret;
do_alloc:
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
if (flags & IOMAP_REPORT) {
if (pos >= size)
ret = -ENOENT;
@@ -956,6 +955,9 @@ do_alloc:
if (pos < size && height == ip->i_height)
ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
}
+hole_found:
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
goto out;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 29f9b6684b74..bf70e3b14938 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -613,7 +613,7 @@ __acquires(&gl->gl_lockref.lock)
fs_err(sdp, "Error %d syncing glock \n", ret);
gfs2_dump_glock(NULL, gl, true);
}
- return;
+ goto skip_inval;
}
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) {
@@ -633,6 +633,7 @@ __acquires(&gl->gl_lockref.lock)
clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
+skip_inval:
gfs2_glock_hold(gl);
/*
* Check for an error encountered since we called go_sync and go_inval.
@@ -722,9 +723,6 @@ __acquires(&gl->gl_lockref.lock)
goto out_unlock;
if (nonblock)
goto out_sched;
- smp_mb();
- if (atomic_read(&gl->gl_revokes) != 0)
- goto out_sched;
set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
gl->gl_target = gl->gl_demote_state;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 70b2d3a1e866..5acd3ce30759 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -622,7 +622,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = finish_no_open(file, NULL);
}
gfs2_glock_dq_uninit(ghs);
- return error;
+ goto fail;
} else if (error != -ENOENT) {
goto fail_gunlock;
}
@@ -764,9 +764,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = finish_open(file, dentry, gfs2_open_common);
}
gfs2_glock_dq_uninit(ghs);
+ gfs2_qa_put(ip);
gfs2_glock_dq_uninit(ghs + 1);
clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
gfs2_glock_put(io_gl);
+ gfs2_qa_put(dip);
return error;
fail_gunlock3:
@@ -776,7 +778,6 @@ fail_gunlock2:
clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
gfs2_glock_put(io_gl);
fail_free_inode:
- gfs2_qa_put(ip);
if (ip->i_gl) {
glock_clear_object(ip->i_gl, ip);
gfs2_glock_put(ip->i_gl);
@@ -1005,7 +1006,7 @@ out_gunlock:
out_child:
gfs2_glock_dq(ghs);
out_parent:
- gfs2_qa_put(ip);
+ gfs2_qa_put(dip);
gfs2_holder_uninit(ghs);
gfs2_holder_uninit(ghs + 1);
return error;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 3a75843ae580..0644e58c6191 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -669,13 +669,13 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
struct buffer_head *bh = bd->bd_bh;
struct gfs2_glock *gl = bd->bd_gl;
+ sdp->sd_log_num_revoke++;
+ if (atomic_inc_return(&gl->gl_revokes) == 1)
+ gfs2_glock_hold(gl);
bh->b_private = NULL;
bd->bd_blkno = bh->b_blocknr;
gfs2_remove_from_ail(bd); /* drops ref on bh */
bd->bd_bh = NULL;
- sdp->sd_log_num_revoke++;
- if (atomic_inc_return(&gl->gl_revokes) == 1)
- gfs2_glock_hold(gl);
set_bit(GLF_LFLUSH, &gl->gl_flags);
list_add(&bd->bd_list, &sdp->sd_log_revokes);
}
@@ -1131,6 +1131,10 @@ int gfs2_logd(void *data)
while (!kthread_should_stop()) {
+ if (gfs2_withdrawn(sdp)) {
+ msleep_interruptible(HZ);
+ continue;
+ }
/* Check for errors writing to the journal */
if (sdp->sd_log_error) {
gfs2_lm(sdp,
@@ -1139,6 +1143,7 @@ int gfs2_logd(void *data)
"prevent further damage.\n",
sdp->sd_fsname, sdp->sd_log_error);
gfs2_withdraw(sdp);
+ continue;
}
did_flush = false;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 5ea96757afc4..cb2a11b458c6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -263,7 +263,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
struct super_block *sb = sdp->sd_vfs;
struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- bio->bi_iter.bi_sector = blkno << (sb->s_blocksize_bits - 9);
+ bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
bio_set_dev(bio, sb->s_bdev);
bio->bi_end_io = end_io;
bio->bi_private = sdp;
@@ -509,12 +509,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
unsigned int bsize = sdp->sd_sb.sb_bsize, off;
unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
unsigned int shift = PAGE_SHIFT - bsize_shift;
- unsigned int readahead_blocks = BIO_MAX_PAGES << shift;
+ unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift;
struct gfs2_journal_extent *je;
int sz, ret = 0;
struct bio *bio = NULL;
struct page *page = NULL;
- bool bio_chained = false, done = false;
+ bool done = false;
errseq_t since;
memset(head, 0, sizeof(*head));
@@ -537,30 +537,30 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
off = 0;
}
- if (!bio || (bio_chained && !off)) {
- /* start new bio */
- } else {
- sz = bio_add_page(bio, page, bsize, off);
- if (sz == bsize)
- goto block_added;
+ if (bio && (off || block < blocks_submitted + max_blocks)) {
+ sector_t sector = dblock << sdp->sd_fsb2bb_shift;
+
+ if (bio_end_sector(bio) == sector) {
+ sz = bio_add_page(bio, page, bsize, off);
+ if (sz == bsize)
+ goto block_added;
+ }
if (off) {
unsigned int blocks =
(PAGE_SIZE - off) >> bsize_shift;
bio = gfs2_chain_bio(bio, blocks);
- bio_chained = true;
goto add_block_to_new_bio;
}
}
if (bio) {
- blocks_submitted = block + 1;
+ blocks_submitted = block;
submit_bio(bio);
}
bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read);
bio->bi_opf = REQ_OP_READ;
- bio_chained = false;
add_block_to_new_bio:
sz = bio_add_page(bio, page, bsize, off);
BUG_ON(sz != bsize);
@@ -568,7 +568,7 @@ block_added:
off += bsize;
if (off == PAGE_SIZE)
page = NULL;
- if (blocks_submitted < blocks_read + readahead_blocks) {
+ if (blocks_submitted <= blocks_read + max_blocks) {
/* Keep at least one bio in flight */
continue;
}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 4b72abcf83b2..9856cc2e0795 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -252,7 +252,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
int num = 0;
if (unlikely(gfs2_withdrawn(sdp)) &&
- (!sdp->sd_jdesc || (blkno != sdp->sd_jdesc->jd_no_addr))) {
+ (!sdp->sd_jdesc || gl != sdp->sd_jinode_gl)) {
*bhp = NULL;
return -EIO;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index cc0c4b5800be..8259fef3f986 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1051,8 +1051,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
u32 x;
int error = 0;
- if (capable(CAP_SYS_RESOURCE) ||
- sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
error = gfs2_quota_hold(ip, uid, gid);
@@ -1125,7 +1124,7 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
int found;
if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
- goto out;
+ return;
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
struct gfs2_quota_data *qd;
@@ -1162,7 +1161,6 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
qd_unlock(qda[x]);
}
-out:
gfs2_quota_unhold(ip);
}
@@ -1210,9 +1208,6 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
return 0;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
- return 0;
-
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
qd = ip->i_qadata->qa_qd[x];
@@ -1270,7 +1265,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
return;
- BUG_ON(ip->i_qadata->qa_ref <= 0);
+ if (gfs2_assert_withdraw(sdp, ip->i_qadata &&
+ ip->i_qadata->qa_ref > 0))
+ return;
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
qd = ip->i_qadata->qa_qd[x];
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 7f9ca8ef40fc..21ada332d555 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -44,7 +44,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
int ret;
ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
- if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ if (capable(CAP_SYS_RESOURCE) ||
+ sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 37fc41632aa2..956fced0a8ec 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1404,7 +1404,6 @@ out:
if (ip->i_qadata)
gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
gfs2_rs_delete(ip, NULL);
- gfs2_qa_put(ip);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9b64d40ab379..aa087a5675af 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -119,6 +119,12 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
if (!sb_rdonly(sdp->sd_vfs))
ret = gfs2_make_fs_ro(sdp);
+ if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */
+ if (!ret)
+ ret = -EIO;
+ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+ goto skip_recovery;
+ }
/*
* Drop the glock for our journal so another node can recover it.
*/
@@ -159,10 +165,6 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
wait_on_bit(&gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE);
}
- if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */
- clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
- goto skip_recovery;
- }
/*
* Dequeue the "live" glock, but keep a reference so it's never freed.
*/
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5190bfb6a665..bb25e3997d41 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -357,7 +357,6 @@ struct io_timeout_data {
struct hrtimer timer;
struct timespec64 ts;
enum hrtimer_mode mode;
- u32 seq_offset;
};
struct io_accept {
@@ -385,7 +384,7 @@ struct io_timeout {
struct file *file;
u64 addr;
int flags;
- unsigned count;
+ u32 count;
};
struct io_rw {
@@ -508,6 +507,7 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_LINK_HEAD_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
@@ -524,6 +524,7 @@ enum {
REQ_F_OVERFLOW_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
+ REQ_F_NO_FILE_TABLE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -543,6 +544,8 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
+ /* head of a link */
+ REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
@@ -575,6 +578,8 @@ enum {
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+ /* doesn't need file table for this request */
+ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
};
struct async_poll {
@@ -614,6 +619,8 @@ struct io_kiocb {
bool needs_fixed_file;
u8 opcode;
+ u16 buf_index;
+
struct io_ring_ctx *ctx;
struct list_head list;
unsigned int flags;
@@ -675,8 +682,6 @@ struct io_op_def {
unsigned needs_mm : 1;
/* needs req->file assigned */
unsigned needs_file : 1;
- /* needs req->file assigned IFF fd is >= 0 */
- unsigned fd_non_neg : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
@@ -779,8 +784,6 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
},
[IORING_OP_OPENAT] = {
- .needs_file = 1,
- .fd_non_neg = 1,
.file_table = 1,
.needs_fs = 1,
},
@@ -794,9 +797,8 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_STATX] = {
.needs_mm = 1,
- .needs_file = 1,
- .fd_non_neg = 1,
.needs_fs = 1,
+ .file_table = 1,
},
[IORING_OP_READ] = {
.needs_mm = 1,
@@ -831,8 +833,6 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
},
[IORING_OP_OPENAT2] = {
- .needs_file = 1,
- .fd_non_neg = 1,
.file_table = 1,
.needs_fs = 1,
},
@@ -926,6 +926,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
+ init_waitqueue_head(&ctx->sqo_wait);
init_waitqueue_head(&ctx->cq_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
@@ -955,8 +956,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
- + atomic_read(&ctx->cached_cq_overflow);
+ return req->sequence != ctx->cached_cq_tail
+ + atomic_read(&ctx->cached_cq_overflow);
}
static inline bool req_need_defer(struct io_kiocb *req)
@@ -1289,7 +1290,7 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
struct io_kiocb *req;
req = ctx->fallback_req;
- if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
+ if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
return req;
return NULL;
@@ -1376,7 +1377,7 @@ static void __io_free_req(struct io_kiocb *req)
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
else
- clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
+ clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
}
struct req_batch {
@@ -1396,10 +1397,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
- if (req->flags & REQ_F_FIXED_FILE) {
- req->file = NULL;
- percpu_ref_put(req->fixed_file_refs);
- }
if (req->flags & REQ_F_INFLIGHT)
inflight++;
__io_req_aux_free(req);
@@ -1437,7 +1434,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
- req->flags &= ~REQ_F_LINK;
+ req->flags &= ~REQ_F_LINK_HEAD;
io_put_req(req);
return true;
}
@@ -1473,7 +1470,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
list_del_init(&req->link_list);
if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK;
+ nxt->flags |= REQ_F_LINK_HEAD;
*nxtptr = nxt;
break;
}
@@ -1484,7 +1481,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
}
/*
- * Called if REQ_F_LINK is set, and we fail the head request
+ * Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
static void io_fail_links(struct io_kiocb *req)
{
@@ -1517,7 +1514,7 @@ static void io_fail_links(struct io_kiocb *req)
static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
- if (likely(!(req->flags & REQ_F_LINK)))
+ if (likely(!(req->flags & REQ_F_LINK_HEAD)))
return;
/*
@@ -1669,10 +1666,10 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
{
- if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
+ if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
return false;
- if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
+ if (req->file || req->io)
rb->need_iter++;
rb->reqs[rb->to_free++] = req;
@@ -2032,7 +2029,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool io_file_supports_async(struct file *file)
+static bool io_file_supports_async(struct file *file, int rw)
{
umode_t mode = file_inode(file)->i_mode;
@@ -2041,7 +2038,13 @@ static bool io_file_supports_async(struct file *file)
if (S_ISREG(mode) && file->f_op != &io_uring_fops)
return true;
- return false;
+ if (!(file->f_mode & FMODE_NOWAIT))
+ return false;
+
+ if (rw == READ)
+ return file->f_op->read_iter != NULL;
+
+ return file->f_op->write_iter != NULL;
}
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@@ -2100,9 +2103,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
- /* we own ->private, reuse it for the buffer index / buffer ID */
- req->rw.kiocb.private = (void *) (unsigned long)
- READ_ONCE(sqe->buf_index);
+ req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -2145,7 +2146,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
struct io_ring_ctx *ctx = req->ctx;
size_t len = req->rw.len;
struct io_mapped_ubuf *imu;
- unsigned index, buf_index;
+ u16 index, buf_index;
size_t offset;
u64 buf_addr;
@@ -2153,7 +2154,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
if (unlikely(!ctx->user_bufs))
return -EFAULT;
- buf_index = (unsigned long) req->rw.kiocb.private;
+ buf_index = req->buf_index;
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
@@ -2269,10 +2270,10 @@ static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
bool needs_lock)
{
struct io_buffer *kbuf;
- int bgid;
+ u16 bgid;
kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- bgid = (int) (unsigned long) req->rw.kiocb.private;
+ bgid = req->buf_index;
kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
if (IS_ERR(kbuf))
return kbuf;
@@ -2363,7 +2364,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
}
/* buffer index only valid with fixed read/write, or buffer select */
- if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT))
+ if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
@@ -2562,14 +2563,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
req->result = 0;
io_size = ret;
- if (req->flags & REQ_F_LINK)
+ if (req->flags & REQ_F_LINK_HEAD)
req->result = io_size;
/*
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(req->file))
+ if (force_nonblock && !io_file_supports_async(req->file, READ))
goto copy_iov;
iov_count = iov_iter_count(&iter);
@@ -2592,7 +2593,8 @@ copy_iov:
if (ret)
goto out_free;
/* any defer here is final, must blocking retry */
- if (!(req->flags & REQ_F_NOWAIT))
+ if (!(req->flags & REQ_F_NOWAIT) &&
+ !file_can_poll(req->file))
req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
@@ -2653,14 +2655,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
req->result = 0;
io_size = ret;
- if (req->flags & REQ_F_LINK)
+ if (req->flags & REQ_F_LINK_HEAD)
req->result = io_size;
/*
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(req->file))
+ if (force_nonblock && !io_file_supports_async(req->file, WRITE))
goto copy_iov;
/* file path doesn't support NOWAIT for non-direct_IO */
@@ -2714,7 +2716,8 @@ copy_iov:
if (ret)
goto out_free;
/* any defer here is final, must blocking retry */
- req->flags |= REQ_F_MUST_PUNT;
+ if (!file_can_poll(req->file))
+ req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
@@ -2754,15 +2757,6 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static bool io_splice_punt(struct file *file)
-{
- if (get_pipe_info(file))
- return false;
- if (!io_file_supports_async(file))
- return true;
- return !(file->f_mode & O_NONBLOCK);
-}
-
static int io_splice(struct io_kiocb *req, bool force_nonblock)
{
struct io_splice *sp = &req->splice;
@@ -2770,19 +2764,16 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
- long ret;
+ long ret = 0;
- if (force_nonblock) {
- if (io_splice_punt(in) || io_splice_punt(out))
- return -EAGAIN;
- flags |= SPLICE_F_NONBLOCK;
- }
+ if (force_nonblock)
+ return -EAGAIN;
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
- ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
+
+ if (sp->len)
+ ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -3353,8 +3344,12 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
struct kstat stat;
int ret;
- if (force_nonblock)
+ if (force_nonblock) {
+ /* only need file table for an actual valid fd */
+ if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
+ req->flags |= REQ_F_NO_FILE_TABLE;
return -EAGAIN;
+ }
if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
return -EINVAL;
@@ -3500,7 +3495,7 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr)
if (io_req_cancelled(req))
return;
__io_sync_file_range(req);
- io_put_req(req); /* put submission ref */
+ io_steal_work(req, workptr);
}
static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
@@ -4140,12 +4135,14 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
init_task_work(&req->task_work, func);
/*
- * If this fails, then the task is exiting. Punt to one of the io-wq
- * threads to ensure the work gets run, we can't always rely on exit
- * cancelation taking care of this.
+ * If this fails, then the task is exiting. When a task exits, the
+ * work gets canceled, so just cancel this request as well instead
+ * of executing it. We can't safely execute it anyway, as we may not
+ * have the needed state needed for it anyway.
*/
ret = task_work_add(tsk, &req->task_work, true);
if (unlikely(ret)) {
+ WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
task_work_add(tsk, &req->task_work, true);
}
@@ -4153,25 +4150,62 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
return 1;
}
+static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
+ __acquires(&req->ctx->completion_lock)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
+
+ spin_lock_irq(&ctx->completion_lock);
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ return true;
+ }
+
+ return false;
+}
+
static void io_async_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
+ bool canceled;
trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
- WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+ if (io_poll_rewait(req, &apoll->poll)) {
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
- if (hash_hashed(&req->hash_node)) {
- spin_lock_irq(&ctx->completion_lock);
+ if (hash_hashed(&req->hash_node))
hash_del(&req->hash_node);
- spin_unlock_irq(&ctx->completion_lock);
+
+ canceled = READ_ONCE(apoll->poll.canceled);
+ if (canceled) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(ctx);
}
+ spin_unlock_irq(&ctx->completion_lock);
+
/* restore ->work in case we need to retry again */
memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (canceled) {
+ kfree(apoll);
+ io_cqring_ev_posted(ctx);
+ req_set_fail_links(req);
+ io_double_put_req(req);
+ return;
+ }
+
__set_current_state(TASK_RUNNING);
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
@@ -4315,11 +4349,13 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
static bool io_poll_remove_one(struct io_kiocb *req)
{
+ struct async_poll *apoll = NULL;
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
+ apoll = req->apoll;
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &req->apoll->poll);
if (do_complete)
@@ -4328,6 +4364,14 @@ static bool io_poll_remove_one(struct io_kiocb *req)
hash_del(&req->hash_node);
+ if (do_complete && apoll) {
+ /*
+ * restore ->work because we need to call io_req_work_drop_env.
+ */
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll);
+ }
+
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
@@ -4342,7 +4386,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
{
struct hlist_node *tmp;
struct io_kiocb *req;
- int i;
+ int posted = 0, i;
spin_lock_irq(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -4350,11 +4394,12 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node)
- io_poll_remove_one(req);
+ posted += io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+ if (posted)
+ io_cqring_ev_posted(ctx);
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4423,18 +4468,11 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
struct io_ring_ctx *ctx = req->ctx;
struct io_poll_iocb *poll = &req->poll;
- if (!req->result && !READ_ONCE(poll->canceled)) {
- struct poll_table_struct pt = { ._key = poll->events };
-
- req->result = vfs_poll(req->file, &pt) & poll->events;
- }
-
- spin_lock_irq(&ctx->completion_lock);
- if (!req->result && !READ_ONCE(poll->canceled)) {
- add_wait_queue(poll->head, &poll->wait);
+ if (io_poll_rewait(req, poll)) {
spin_unlock_irq(&ctx->completion_lock);
return;
}
+
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
@@ -4665,11 +4703,12 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_timeout(struct io_kiocb *req)
{
- unsigned count;
struct io_ring_ctx *ctx = req->ctx;
struct io_timeout_data *data;
struct list_head *entry;
unsigned span = 0;
+ u32 count = req->timeout.count;
+ u32 seq = req->sequence;
data = &req->io->timeout;
@@ -4678,7 +4717,6 @@ static int io_timeout(struct io_kiocb *req)
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- count = req->timeout.count;
if (!count) {
req->flags |= REQ_F_TIMEOUT_NOSEQ;
spin_lock_irq(&ctx->completion_lock);
@@ -4686,8 +4724,7 @@ static int io_timeout(struct io_kiocb *req)
goto add;
}
- req->sequence = ctx->cached_sq_head + count - 1;
- data->seq_offset = count;
+ req->sequence = seq + count;
/*
* Insertion sort, ensuring the first entry in the list is always
@@ -4696,26 +4733,26 @@ static int io_timeout(struct io_kiocb *req)
spin_lock_irq(&ctx->completion_lock);
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- unsigned nxt_sq_head;
+ unsigned nxt_seq;
long long tmp, tmp_nxt;
- u32 nxt_offset = nxt->io->timeout.seq_offset;
+ u32 nxt_offset = nxt->timeout.count;
if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
continue;
/*
- * Since cached_sq_head + count - 1 can overflow, use type long
+ * Since seq + count can overflow, use type long
* long to store it.
*/
- tmp = (long long)ctx->cached_sq_head + count - 1;
- nxt_sq_head = nxt->sequence - nxt_offset + 1;
- tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
+ tmp = (long long)seq + count;
+ nxt_seq = nxt->sequence - nxt_offset;
+ tmp_nxt = (long long)nxt_seq + nxt_offset;
/*
* cached_sq_head may overflow, and it will never overflow twice
* once there is some timeout req still be valid.
*/
- if (ctx->cached_sq_head < nxt_sq_head)
+ if (seq < nxt_seq)
tmp += UINT_MAX;
if (tmp > tmp_nxt)
@@ -4973,15 +5010,16 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int ret;
/* Still need defer if there is pending req in defer list. */
- if (!req_need_defer(req) && list_empty(&ctx->defer_list))
+ if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
return 0;
- if (!req->io && io_alloc_async_ctx(req))
- return -EAGAIN;
-
- ret = io_req_defer_prep(req, sqe);
- if (ret < 0)
- return ret;
+ if (!req->io) {
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
+ ret = io_req_defer_prep(req, sqe);
+ if (ret < 0)
+ return ret;
+ }
spin_lock_irq(&ctx->completion_lock);
if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
@@ -5268,7 +5306,8 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
return ret;
- if (ctx->flags & IORING_SETUP_IOPOLL) {
+ /* If the op doesn't have a file, we're not polling for it */
+ if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
const bool in_async = io_wq_current_is_worker();
if (req->result == -EAGAIN)
@@ -5322,15 +5361,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
io_steal_work(req, workptr);
}
-static int io_req_needs_file(struct io_kiocb *req, int fd)
-{
- if (!io_op_defs[req->opcode].needs_file)
- return 0;
- if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
- return 0;
- return 1;
-}
-
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
int index)
{
@@ -5368,14 +5398,11 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- int fd, unsigned int flags)
+ int fd)
{
bool fixed;
- if (!io_req_needs_file(req, fd))
- return 0;
-
- fixed = (flags & IOSQE_FIXED_FILE);
+ fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
if (unlikely(!fixed && req->needs_fixed_file))
return -EBADF;
@@ -5387,7 +5414,7 @@ static int io_grab_files(struct io_kiocb *req)
int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx;
- if (req->work.files)
+ if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
return 0;
if (!ctx->ring_file)
return -EBADF;
@@ -5476,7 +5503,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *nxt;
- if (!(req->flags & REQ_F_LINK))
+ if (!(req->flags & REQ_F_LINK_HEAD))
return NULL;
/* for polled retry, if flag is set, we already went through here */
if (req->flags & REQ_F_POLLED)
@@ -5581,9 +5608,15 @@ fail_req:
io_double_put_req(req);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
- ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret < 0))
- goto fail_req;
+ if (!req->io) {
+ ret = -EAGAIN;
+ if (io_alloc_async_ctx(req))
+ goto fail_req;
+ ret = io_req_defer_prep(req, sqe);
+ if (unlikely(ret < 0))
+ goto fail_req;
+ }
+
/*
* Never try inline submit of IOSQE_ASYNC is set, go straight
* to async execution.
@@ -5604,54 +5637,11 @@ static inline void io_queue_link_head(struct io_kiocb *req)
io_queue_sqe(req, NULL);
}
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
-
-static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned int sqe_flags;
- int ret, id, fd;
-
- sqe_flags = READ_ONCE(sqe->flags);
-
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
- ret = -EINVAL;
- goto err_req;
- }
-
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select) {
- ret = -EOPNOTSUPP;
- goto err_req;
- }
-
- id = READ_ONCE(sqe->personality);
- if (id) {
- req->work.creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!req->work.creds)) {
- ret = -EINVAL;
- goto err_req;
- }
- get_cred(req->work.creds);
- }
-
- /* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
- IOSQE_ASYNC | IOSQE_FIXED_FILE |
- IOSQE_BUFFER_SELECT);
-
- fd = READ_ONCE(sqe->fd);
- ret = io_req_set_file(state, req, fd, sqe_flags);
- if (unlikely(ret)) {
-err_req:
- io_cqring_add_event(req, ret);
- io_double_put_req(req);
- return false;
- }
+ int ret;
/*
* If we already have a head request, queue this one for async
@@ -5670,42 +5660,39 @@ err_req:
* next after the link request. The last one is done via
* drain_next flag to persist the effect across calls.
*/
- if (sqe_flags & IOSQE_IO_DRAIN) {
+ if (req->flags & REQ_F_IO_DRAIN) {
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- if (io_alloc_async_ctx(req)) {
- ret = -EAGAIN;
- goto err_req;
- }
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
ret = io_req_defer_prep(req, sqe);
if (ret) {
/* fail even hard links since we don't submit */
head->flags |= REQ_F_FAIL_LINK;
- goto err_req;
+ return ret;
}
trace_io_uring_link(ctx, req, head);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
- if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
+ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_link_head(head);
*link = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
- req->ctx->drain_next = 0;
+ ctx->drain_next = 0;
}
- if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
- req->flags |= REQ_F_LINK;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ req->flags |= REQ_F_LINK_HEAD;
INIT_LIST_HEAD(&req->link_list);
- if (io_alloc_async_ctx(req)) {
- ret = -EAGAIN;
- goto err_req;
- }
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
+
ret = io_req_defer_prep(req, sqe);
if (ret)
req->flags |= REQ_F_FAIL_LINK;
@@ -5715,7 +5702,7 @@ err_req:
}
}
- return true;
+ return 0;
}
/*
@@ -5789,15 +5776,23 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx)
ctx->cached_sq_head++;
}
-static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_BUFFER_SELECT)
+
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe,
+ struct io_submit_state *state, bool async)
{
+ unsigned int sqe_flags;
+ int id;
+
/*
* All io need record the previous position, if LINK vs DARIN,
* it can be used to mark the position of the first IO in the
* link list.
*/
- req->sequence = ctx->cached_sq_head;
+ req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
req->opcode = READ_ONCE(sqe->opcode);
req->user_data = READ_ONCE(sqe->user_data);
req->io = NULL;
@@ -5808,17 +5803,52 @@ static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
refcount_set(&req->refs, 2);
req->task = NULL;
req->result = 0;
+ req->needs_fixed_file = async;
INIT_IO_WORK(&req->work, io_wq_submit_work);
+
+ if (unlikely(req->opcode >= IORING_OP_LAST))
+ return -EINVAL;
+
+ if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+ if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ use_mm(ctx->sqo_mm);
+ }
+
+ sqe_flags = READ_ONCE(sqe->flags);
+ /* enforce forwards compatibility on users */
+ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+ return -EINVAL;
+
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[req->opcode].buffer_select)
+ return -EOPNOTSUPP;
+
+ id = READ_ONCE(sqe->personality);
+ if (id) {
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds))
+ return -EINVAL;
+ get_cred(req->work.creds);
+ }
+
+ /* same numerical values with corresponding REQ_F_*, safe to copy */
+ req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+ IOSQE_ASYNC | IOSQE_FIXED_FILE |
+ IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
+
+ if (!io_op_defs[req->opcode].needs_file)
+ return 0;
+
+ return io_req_set_file(state, req, READ_ONCE(sqe->fd));
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- struct file *ring_file, int ring_fd,
- struct mm_struct **mm, bool async)
+ struct file *ring_file, int ring_fd, bool async)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
int i, submitted = 0;
- bool mm_fault = false;
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -5858,34 +5888,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
break;
}
- io_init_req(ctx, req, sqe);
+ err = io_init_req(ctx, req, sqe, statep, async);
io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
- if (unlikely(req->opcode >= IORING_OP_LAST)) {
- err = -EINVAL;
+ if (unlikely(err)) {
fail_req:
io_cqring_add_event(req, err);
io_double_put_req(req);
break;
}
- if (io_op_defs[req->opcode].needs_mm && !*mm) {
- mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
- if (unlikely(mm_fault)) {
- err = -EFAULT;
- goto fail_req;
- }
- use_mm(ctx->sqo_mm);
- *mm = ctx->sqo_mm;
- }
-
- req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, async);
- if (!io_submit_sqe(req, sqe, statep, &link))
- break;
+ err = io_submit_sqe(req, sqe, statep, &link);
+ if (err)
+ goto fail_req;
}
if (unlikely(submitted != nr)) {
@@ -5904,10 +5923,19 @@ fail_req:
return submitted;
}
+static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
- struct mm_struct *cur_mm = NULL;
const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
@@ -5948,11 +5976,7 @@ static int io_sq_thread(void *data)
* adding ourselves to the waitqueue, as the unuse/drop
* may sleep.
*/
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- cur_mm = NULL;
- }
+ io_sq_thread_drop_mm(ctx);
/*
* We're polling. If we're within the defined idle
@@ -6008,6 +6032,7 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ ret = 0;
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
@@ -6016,7 +6041,7 @@ static int io_sq_thread(void *data)
}
mutex_lock(&ctx->uring_lock);
- ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+ ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
mutex_unlock(&ctx->uring_lock);
timeout = jiffies + ctx->sq_thread_idle;
}
@@ -6025,10 +6050,7 @@ static int io_sq_thread(void *data)
task_work_run();
set_fs(old_fs);
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- }
+ io_sq_thread_drop_mm(ctx);
revert_creds(old_cred);
kthread_parkme();
@@ -6824,7 +6846,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
{
int ret;
- init_waitqueue_head(&ctx->sqo_wait);
mmgrab(current->mm);
ctx->sqo_mm = current->mm;
@@ -7299,7 +7320,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
* it could cause shutdown to hang.
*/
while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
- cpu_relax();
+ cond_resched();
io_kill_timeouts(ctx);
io_poll_remove_all(ctx);
@@ -7328,11 +7349,9 @@ static int io_uring_release(struct inode *inode, struct file *file)
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
- struct io_kiocb *req;
- DEFINE_WAIT(wait);
-
while (!list_empty_careful(&ctx->inflight_list)) {
- struct io_kiocb *cancel_req = NULL;
+ struct io_kiocb *cancel_req = NULL, *req;
+ DEFINE_WAIT(wait);
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
@@ -7372,6 +7391,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
*/
if (refcount_sub_and_test(2, &cancel_req->refs)) {
io_put_req(cancel_req);
+ finish_wait(&ctx->inflight_wait, &wait);
continue;
}
}
@@ -7379,8 +7399,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
io_put_req(cancel_req);
schedule();
+ finish_wait(&ctx->inflight_wait, &wait);
}
- finish_wait(&ctx->inflight_wait, &wait);
}
static int io_uring_flush(struct file *file, void *data)
@@ -7509,13 +7529,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
wake_up(&ctx->sqo_wait);
submitted = to_submit;
} else if (to_submit) {
- struct mm_struct *cur_mm;
-
mutex_lock(&ctx->uring_lock);
- /* already have mm, so io_submit_sqes() won't try to grab it */
- cur_mm = ctx->sqo_mm;
- submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
- &cur_mm, false);
+ submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
mutex_unlock(&ctx->uring_lock);
if (submitted != to_submit)
@@ -7734,7 +7749,8 @@ err:
return ret;
}
-static int io_uring_create(unsigned entries, struct io_uring_params *p)
+static int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
{
struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
@@ -7826,6 +7842,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
p->cq_off.cqes = offsetof(struct io_rings, cqes);
+ p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
+ IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
+ IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
+
+ if (copy_to_user(params, p, sizeof(*p))) {
+ ret = -EFAULT;
+ goto err;
+ }
/*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
@@ -7834,9 +7858,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
if (ret < 0)
goto err;
- p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
- IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
@@ -7852,7 +7873,6 @@ err:
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
struct io_uring_params p;
- long ret;
int i;
if (copy_from_user(&p, params, sizeof(p)))
@@ -7867,14 +7887,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
return -EINVAL;
- ret = io_uring_create(entries, &p);
- if (ret < 0)
- return ret;
-
- if (copy_to_user(params, &p, sizeof(p)))
- return -EFAULT;
-
- return ret;
+ return io_uring_create(entries, &p, params);
}
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 282d45be6f45..5e80b40bc1b5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(vfs_ioctl);
static int ioctl_fibmap(struct file *filp, int __user *p)
{
struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
int error, ur_block;
sector_t block;
@@ -71,6 +72,13 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
block = ur_block;
error = bmap(inode, &block);
+ if (block > INT_MAX) {
+ error = -ERANGE;
+ pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n",
+ current->comm, task_pid_nr(current),
+ sb->s_id, filp);
+ }
+
if (error)
ur_block = 0;
else
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index bccf305ea9ce..d55e8f491a5e 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -117,10 +117,7 @@ iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
if (iomap->type == IOMAP_MAPPED) {
addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
- if (addr > INT_MAX)
- WARN(1, "would truncate bmap result\n");
- else
- *bno = addr;
+ *bno = addr;
}
return 0;
}
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 1abf126c2df4..a60df88efc40 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -118,8 +118,6 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
nfss->fscache_key = NULL;
nfss->fscache = NULL;
- if (!(nfss->options & NFS_OPTION_FSCACHE))
- return;
if (!uniq) {
uniq = "";
ulen = 1;
@@ -188,7 +186,8 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
/* create a cache index for looking up filehandles */
nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
&nfs_fscache_super_index_def,
- key, sizeof(*key) + ulen,
+ &key->key,
+ sizeof(key->key) + ulen,
NULL, 0,
nfss, 0, true);
dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
@@ -226,6 +225,19 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
}
}
+static void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
+ struct nfs_inode *nfsi)
+{
+ memset(auxdata, 0, sizeof(*auxdata));
+ auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
+ auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
+ auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
+ auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
+
+ if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+ auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+}
+
/*
* Initialise the per-inode cache cookie pointer for an NFS inode.
*/
@@ -239,14 +251,7 @@ void nfs_fscache_init_inode(struct inode *inode)
if (!(nfss->fscache && S_ISREG(inode->i_mode)))
return;
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
- auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
- auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
- auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
-
- if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+ nfs_fscache_update_auxdata(&auxdata, nfsi);
nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
&nfs_fscache_inode_object_def,
@@ -266,11 +271,7 @@ void nfs_fscache_clear_inode(struct inode *inode)
dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
- auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
- auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
- auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
+ nfs_fscache_update_auxdata(&auxdata, nfsi);
fscache_relinquish_cookie(cookie, &auxdata, false);
nfsi->fscache = NULL;
}
@@ -310,11 +311,7 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp)
if (!fscache_cookie_valid(cookie))
return;
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
- auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
- auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
- auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
+ nfs_fscache_update_auxdata(&auxdata, nfsi);
if (inode_is_open_for_write(inode)) {
dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 35c8cb2d7637..dda5c3e65d8d 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -30,6 +30,7 @@
#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN))
#define MNT_status_sz (1)
#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE)
+#define MNT_fhandlev3_sz XDR_QUADLEN(NFS3_FHSIZE)
#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS)
/*
@@ -37,7 +38,7 @@
*/
#define MNT_enc_dirpath_sz encode_dirpath_sz
#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz)
-#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \
+#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandlev3_sz + \
MNT_authflav3_sz)
/*
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index c5c3fc6e6c60..26c94b32d6f4 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -253,37 +253,45 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *alloc = NULL, *dfacl = NULL;
+ struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
int status;
if (S_ISDIR(inode->i_mode)) {
switch(type) {
case ACL_TYPE_ACCESS:
- alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ alloc = get_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(alloc))
goto fail;
+ dfacl = alloc;
break;
case ACL_TYPE_DEFAULT:
- dfacl = acl;
- alloc = acl = get_acl(inode, ACL_TYPE_ACCESS);
+ alloc = get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(alloc))
goto fail;
+ dfacl = acl;
+ acl = alloc;
break;
}
}
if (acl == NULL) {
- alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+ alloc = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
if (IS_ERR(alloc))
goto fail;
+ acl = alloc;
}
status = __nfs3_proc_setacls(inode, acl, dfacl);
- posix_acl_release(alloc);
+out:
+ if (acl != orig)
+ posix_acl_release(acl);
+ if (dfacl != orig)
+ posix_acl_release(dfacl);
return status;
fail:
- return PTR_ERR(alloc);
+ status = PTR_ERR(alloc);
+ goto out;
}
const struct xattr_handler *nfs3_xattr_handlers[] = {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 512afb1c7867..9056f3dd380e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6347,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
.rpc_client = server->client,
.rpc_message = &msg,
.callback_ops = &nfs4_delegreturn_ops,
- .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
};
int status = 0;
@@ -7891,6 +7891,7 @@ static void
nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
{
struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp;
+ struct nfs41_bind_conn_to_session_res *res = task->tk_msg.rpc_resp;
struct nfs_client *clp = args->client;
switch (task->tk_status) {
@@ -7899,6 +7900,12 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
nfs4_schedule_session_recovery(clp->cl_session,
task->tk_status);
}
+ if (args->dir == NFS4_CDFC4_FORE_OR_BOTH &&
+ res->dir != NFS4_CDFS4_BOTH) {
+ rpc_task_close_connection(task);
+ if (args->retries++ < MAX_BIND_CONN_TO_SESSION_RETRIES)
+ rpc_restart_call(task);
+ }
}
static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
@@ -7921,6 +7928,7 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
struct nfs41_bind_conn_to_session_args args = {
.client = clp,
.dir = NFS4_CDFC4_FORE_OR_BOTH,
+ .retries = 0,
};
struct nfs41_bind_conn_to_session_res res;
struct rpc_message msg = {
@@ -9191,8 +9199,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- return ERR_CAST(task);
+
status = rpc_wait_for_completion_task(task);
if (status != 0)
goto out;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ac93715c05a4..a8dc25ce48bb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -734,9 +734,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
state = new;
state->owner = owner;
atomic_inc(&owner->so_count);
- list_add_rcu(&state->inode_states, &nfsi->open_states);
ihold(inode);
state->inode = inode;
+ list_add_rcu(&state->inode_states, &nfsi->open_states);
spin_unlock(&inode->i_lock);
/* Note: The reclaim code dictates that we add stateless
* and read-only stateids to the end of the list */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index f61f96603df7..6ca421cbe19c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -752,7 +752,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
.callback_ops = call_ops,
.callback_data = hdr,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
+ .flags = RPC_TASK_ASYNC | flags,
};
hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
@@ -950,7 +950,8 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
hdr->cred,
NFS_PROTO(hdr->inode),
desc->pg_rpc_callops,
- desc->pg_ioflags, 0);
+ desc->pg_ioflags,
+ RPC_TASK_CRED_NOREF);
return ret;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f2dc35c22964..dd2e14f5875d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1332,13 +1332,15 @@ _pnfs_return_layout(struct inode *ino)
!valid_layout) {
spin_unlock(&ino->i_lock);
dprintk("NFS: %s no layout segments to return\n", __func__);
- goto out_put_layout_hdr;
+ goto out_wait_layoutreturn;
}
send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
spin_unlock(&ino->i_lock);
if (send)
status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
+out_wait_layoutreturn:
+ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE);
out_put_layout_hdr:
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -1456,18 +1458,15 @@ retry:
/* lo ref dropped in pnfs_roc_release() */
layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
/* If the creds don't match, we can't compound the layoutreturn */
- if (!layoutreturn)
+ if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0)
goto out_noroc;
- if (cred_fscmp(cred, lc_cred) != 0)
- goto out_noroc_put_cred;
roc = layoutreturn;
pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
res->lrs_present = 0;
layoutreturn = false;
-
-out_noroc_put_cred:
put_cred(lc_cred);
+
out_noroc:
spin_unlock(&ino->i_lock);
rcu_read_unlock();
@@ -2023,6 +2022,7 @@ lookup_again:
goto lookup_again;
}
+ spin_unlock(&ino->i_lock);
first = true;
status = nfs4_select_rw_stateid(ctx->state,
iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
@@ -2032,12 +2032,12 @@ lookup_again:
trace_pnfs_update_layout(ino, pos, count,
iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_INVALID_OPEN);
- spin_unlock(&ino->i_lock);
nfs4_schedule_stateid_recovery(server, ctx->state);
pnfs_clear_first_layoutget(lo);
pnfs_put_layout_hdr(lo);
goto lookup_again;
}
+ spin_lock(&ino->i_lock);
} else {
nfs4_stateid_copy(&stateid, &lo->plh_stateid);
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index e7ddbce48321..679767ac258d 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -536,7 +536,8 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
nfs_init_commit(data, NULL, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
- data->mds_ops, how, 0);
+ data->mds_ops, how,
+ RPC_TASK_CRED_NOREF);
} else {
nfs_init_commit(data, NULL, data->lseg, cinfo);
initiate_commit(data, how);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 59ef3b13ccca..7a70287f21a2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -185,7 +185,7 @@ static int __nfs_list_for_each_server(struct list_head *head,
rcu_read_lock();
list_for_each_entry_rcu(server, head, client_link) {
- if (!nfs_sb_active(server->super))
+ if (!(server->super && nfs_sb_active(server->super)))
continue;
rcu_read_unlock();
if (last)
@@ -1189,7 +1189,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
uniq = ctx->fscache_uniq;
ulen = strlen(ctx->fscache_uniq);
}
- return;
}
nfs_fscache_get_super_cookie(sb, uniq, ulen);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index df4b87c30ac9..1e767f779c49 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1695,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
.callback_ops = call_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
+ .flags = RPC_TASK_ASYNC | flags,
.priority = priority,
};
/* Set up the initial task struct. */
@@ -1813,7 +1813,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
nfs_init_commit(data, head, NULL, cinfo);
atomic_inc(&cinfo->mds->rpcs_out);
return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
- data->mds_ops, how, 0);
+ data->mds_ops, how, RPC_TASK_CRED_NOREF);
}
/*
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c3b11a715082..5cf91322de0f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1312,6 +1312,7 @@ nfsd4_run_cb_work(struct work_struct *work)
container_of(work, struct nfsd4_callback, cb_work);
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
+ int flags;
if (cb->cb_need_restart) {
cb->cb_need_restart = false;
@@ -1340,7 +1341,8 @@ nfsd4_run_cb_work(struct work_struct *work)
}
cb->cb_msg.rpc_cred = clp->cl_cb_cred;
- rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+ flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
+ rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index a8fb18609146..9e40dfecf1b1 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -127,16 +127,8 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
goto out;
}
- {
- SHASH_DESC_ON_STACK(desc, tfm);
-
- desc->tfm = tfm;
-
- status = crypto_shash_digest(desc, clname->data, clname->len,
- cksum.data);
- shash_desc_zero(desc);
- }
-
+ status = crypto_shash_tfm_digest(tfm, clname->data, clname->len,
+ cksum.data);
if (status)
goto out;
@@ -1148,7 +1140,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp)
struct crypto_shash *tfm = cn->cn_tfm;
struct xdr_netobj cksum;
char *principal = NULL;
- SHASH_DESC_ON_STACK(desc, tfm);
/* Don't upcall if it's already stored */
if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
@@ -1170,16 +1161,14 @@ nfsd4_cld_create_v2(struct nfs4_client *clp)
else if (clp->cl_cred.cr_principal)
principal = clp->cl_cred.cr_principal;
if (principal) {
- desc->tfm = tfm;
cksum.len = crypto_shash_digestsize(tfm);
cksum.data = kmalloc(cksum.len, GFP_KERNEL);
if (cksum.data == NULL) {
ret = -ENOMEM;
goto out;
}
- ret = crypto_shash_digest(desc, principal, strlen(principal),
- cksum.data);
- shash_desc_zero(desc);
+ ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal),
+ cksum.data);
if (ret) {
kfree(cksum.data);
goto out;
@@ -1343,7 +1332,6 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
struct crypto_shash *tfm = cn->cn_tfm;
struct xdr_netobj cksum;
char *principal = NULL;
- SHASH_DESC_ON_STACK(desc, tfm);
/* did we already find that this client is stable? */
if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
@@ -1381,14 +1369,12 @@ found:
principal = clp->cl_cred.cr_principal;
if (principal == NULL)
return -ENOENT;
- desc->tfm = tfm;
cksum.len = crypto_shash_digestsize(tfm);
cksum.data = kmalloc(cksum.len, GFP_KERNEL);
if (cksum.data == NULL)
return -ENOENT;
- status = crypto_shash_digest(desc, principal, strlen(principal),
- cksum.data);
- shash_desc_zero(desc);
+ status = crypto_shash_tfm_digest(tfm, principal,
+ strlen(principal), cksum.data);
if (status) {
kfree(cksum.data);
return -ENOENT;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e32ecedece0f..c107caa56525 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -267,6 +267,8 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
if (!nbl) {
nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
if (nbl) {
+ INIT_LIST_HEAD(&nbl->nbl_list);
+ INIT_LIST_HEAD(&nbl->nbl_lru);
fh_copy_shallow(&nbl->nbl_fh, fh);
locks_init_lock(&nbl->nbl_lock);
nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 5435a40f82be..c18459cea6f4 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -520,7 +520,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
mask = fanotify_group_event_mask(group, iter_info, mask, data,
data_type);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8e4f1ace467c..1de77f1a600b 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -275,7 +275,6 @@ static ssize_t dlmfs_file_write(struct file *filp,
loff_t *ppos)
{
int bytes_left;
- ssize_t writelen;
char *lvb_buf;
struct inode *inode = file_inode(filp);
@@ -285,32 +284,30 @@ static ssize_t dlmfs_file_write(struct file *filp,
if (*ppos >= i_size_read(inode))
return -ENOSPC;
+ /* don't write past the lvb */
+ if (count > i_size_read(inode) - *ppos)
+ count = i_size_read(inode) - *ppos;
+
if (!count)
return 0;
if (!access_ok(buf, count))
return -EFAULT;
- /* don't write past the lvb */
- if ((count + *ppos) > i_size_read(inode))
- writelen = i_size_read(inode) - *ppos;
- else
- writelen = count - *ppos;
-
- lvb_buf = kmalloc(writelen, GFP_NOFS);
+ lvb_buf = kmalloc(count, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
- bytes_left = copy_from_user(lvb_buf, buf, writelen);
- writelen -= bytes_left;
- if (writelen)
- user_dlm_write_lvb(inode, lvb_buf, writelen);
+ bytes_left = copy_from_user(lvb_buf, buf, count);
+ count -= bytes_left;
+ if (count)
+ user_dlm_write_lvb(inode, lvb_buf, count);
kfree(lvb_buf);
- *ppos = *ppos + writelen;
- mlog(0, "wrote %zd bytes\n", writelen);
- return writelen;
+ *ppos = *ppos + count;
+ mlog(0, "wrote %zu bytes\n", count);
+ return count;
}
static void dlmfs_init_once(void *foo)
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 475c61f53f0f..ed5c1078919c 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -783,6 +783,9 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type)
if (fh_type != OVL_FILEID_V0)
return ERR_PTR(-EINVAL);
+ if (buflen <= OVL_FH_WIRE_OFFSET)
+ return ERR_PTR(-EINVAL);
+
fh = kzalloc(buflen, GFP_KERNEL);
if (!fh)
return ERR_PTR(-ENOMEM);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index b0d42ece4d7c..981f11ec51bc 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -58,6 +58,24 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
attr->ia_valid &= ~ATTR_MODE;
+ /*
+ * We might have to translate ovl file into real file object
+ * once use cases emerge. For now, simply don't let underlying
+ * filesystem rely on attr->ia_file
+ */
+ attr->ia_valid &= ~ATTR_FILE;
+
+ /*
+ * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN
+ * set. Overlayfs does not pass O_TRUNC flag to underlying
+ * filesystem during open -> do not pass ATTR_OPEN. This
+ * disables optimization in fuse which assumes open(O_TRUNC)
+ * already set file size to 0. But we never passed O_TRUNC to
+ * fuse. So by clearing ATTR_OPEN, fuse will be forced to send
+ * setattr request to server.
+ */
+ attr->ia_valid &= ~ATTR_OPEN;
+
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
err = notify_change(upperdentry, attr, NULL);
diff --git a/fs/pnode.c b/fs/pnode.c
index 49f6d7ff2139..1106137c747a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -261,14 +261,13 @@ static int propagate_one(struct mount *m)
child = copy_tree(last_source, last_source->mnt.mnt_root, type);
if (IS_ERR(child))
return PTR_ERR(child);
+ read_seqlock_excl(&mount_lock);
mnt_set_mountpoint(m, mp, child);
+ if (m->mnt_master != dest_master)
+ SET_MNT_MARK(m->mnt_master);
+ read_sequnlock_excl(&mount_lock);
last_dest = m;
last_source = child;
- if (m->mnt_master != dest_master) {
- read_seqlock_excl(&mount_lock);
- SET_MNT_MARK(m->mnt_master);
- read_sequnlock_excl(&mount_lock);
- }
hlist_add_head(&child->mnt_hash, list);
return count_mounts(m->mnt_ns, child);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6042b646ab27..eb2255e95f62 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1573,6 +1573,7 @@ static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
noffsets = 0;
for (pos = kbuf; pos; pos = next_line) {
struct proc_timens_offset *off = &offsets[noffsets];
+ char clock[10];
int err;
/* Find the end of line and ensure we don't look past it */
@@ -1584,10 +1585,21 @@ static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
next_line = NULL;
}
- err = sscanf(pos, "%u %lld %lu", &off->clockid,
+ err = sscanf(pos, "%9s %lld %lu", clock,
&off->val.tv_sec, &off->val.tv_nsec);
if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
goto out;
+
+ clock[sizeof(clock) - 1] = 0;
+ if (strcmp(clock, "monotonic") == 0 ||
+ strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
+ off->clockid = CLOCK_MONOTONIC;
+ else if (strcmp(clock, "boottime") == 0 ||
+ strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
+ off->clockid = CLOCK_BOOTTIME;
+ else
+ goto out;
+
noffsets++;
if (noffsets == ARRAY_SIZE(offsets)) {
if (next_line)
@@ -3274,7 +3286,6 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
void proc_flush_pid(struct pid *pid)
{
proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
- put_pid(pid);
}
static struct dentry *proc_pid_instantiate(struct dentry * dentry,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8c1f1bb1a5ce..09cd51c8d23d 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -103,6 +103,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
global_zone_page_state(NR_KERNEL_STACK_KB));
+#ifdef CONFIG_SHADOW_CALL_STACK
+ seq_printf(m, "ShadowCallStack:%8lu kB\n",
+ global_zone_page_state(NR_KERNEL_SCS_KB));
+#endif
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 2633f10446c3..cdbe9293ea55 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -196,6 +196,13 @@ static void proc_kill_sb(struct super_block *sb)
if (ns->proc_thread_self)
dput(ns->proc_thread_self);
kill_anon_super(sb);
+
+ /* Make the pid namespace safe for the next mount of proc */
+ ns->proc_self = NULL;
+ ns->proc_thread_self = NULL;
+ ns->pid_gid = GLOBAL_ROOT_GID;
+ ns->hide_pid = 0;
+
put_pid_ns(ns);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8d382d4ec067..10a6d472397f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -622,9 +622,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
[ilog2(VM_DENYWRITE)] = "dw",
-#ifdef CONFIG_X86_INTEL_MPX
- [ilog2(VM_MPX)] = "mp",
-#endif
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
@@ -638,6 +635,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_WIPEONFORK)] = "wf",
[ilog2(VM_DONTDUMP)] = "dd",
+#ifdef CONFIG_ARM64_BTI
+ [ilog2(VM_ARM64_BTI)] = "bt",
+#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
[ilog2(VM_SOFTDIRTY)] = "sd",
#endif
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7dc800cce354..c663202da8de 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -266,7 +266,8 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
- if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) {
+ if (remap_vmalloc_range_partial(vma, dst, buf, 0,
+ tsz)) {
ret = -EFAULT;
goto out_unlock;
}
@@ -624,7 +625,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
- kaddr, tsz))
+ kaddr, 0, tsz))
goto fail;
size -= tsz;
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 8f0369aad22a..e16a49ebfe54 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -153,3 +153,112 @@ config PSTORE_RAM
"ramoops.ko".
For more information, see Documentation/admin-guide/ramoops.rst.
+
+config PSTORE_ZONE
+ tristate
+ depends on PSTORE
+ help
+ The common layer for pstore/blk (and pstore/ram in the future)
+ to manage storage in zones.
+
+config PSTORE_BLK
+ tristate "Log panic/oops to a block device"
+ depends on PSTORE
+ depends on BLOCK
+ select PSTORE_ZONE
+ default n
+ help
+ This enables panic and oops message to be logged to a block dev
+ where it can be read back at some later point.
+
+ For more information, see Documentation/admin-guide/pstore-blk.rst
+
+ If unsure, say N.
+
+config PSTORE_BLK_BLKDEV
+ string "block device identifier"
+ depends on PSTORE_BLK
+ default ""
+ help
+ Which block device should be used for pstore/blk.
+
+ It accepts the following variants:
+ 1) <hex_major><hex_minor> device number in hexadecimal representation,
+ with no leading 0x, for example b302.
+ 2) /dev/<disk_name> represents the device name of disk
+ 3) /dev/<disk_name><decimal> represents the device name and number
+ of partition - device number of disk plus the partition number
+ 4) /dev/<disk_name>p<decimal> - same as the above, this form is
+ used when disk name of partitioned disk ends with a digit.
+ 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
+ unique id of a partition if the partition table provides it.
+ The UUID may be either an EFI/GPT UUID, or refer to an MSDOS
+ partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero-
+ filled hex representation of the 32-bit "NT disk signature", and PP
+ is a zero-filled hex representation of the 1-based partition number.
+ 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation
+ to a partition with a known unique id.
+ 7) <major>:<minor> major and minor number of the device separated by
+ a colon.
+
+ NOTE that, both Kconfig and module parameters can configure
+ pstore/blk, but module parameters have priority over Kconfig.
+
+config PSTORE_BLK_KMSG_SIZE
+ int "Size in Kbytes of kmsg dump log to store"
+ depends on PSTORE_BLK
+ default 64
+ help
+ This just sets size of kmsg dump (oops, panic, etc) log for
+ pstore/blk. The size is in KB and must be a multiple of 4.
+
+ NOTE that, both Kconfig and module parameters can configure
+ pstore/blk, but module parameters have priority over Kconfig.
+
+config PSTORE_BLK_MAX_REASON
+ int "Maximum kmsg dump reason to store"
+ depends on PSTORE_BLK
+ default 2
+ help
+ The maximum reason for kmsg dumps to store. The default is
+ 2 (KMSG_DUMP_OOPS), see include/linux/kmsg_dump.h's
+ enum kmsg_dump_reason for more details.
+
+ NOTE that, both Kconfig and module parameters can configure
+ pstore/blk, but module parameters have priority over Kconfig.
+
+config PSTORE_BLK_PMSG_SIZE
+ int "Size in Kbytes of pmsg to store"
+ depends on PSTORE_BLK
+ depends on PSTORE_PMSG
+ default 64
+ help
+ This just sets size of pmsg (pmsg_size) for pstore/blk. The size is
+ in KB and must be a multiple of 4.
+
+ NOTE that, both Kconfig and module parameters can configure
+ pstore/blk, but module parameters have priority over Kconfig.
+
+config PSTORE_BLK_CONSOLE_SIZE
+ int "Size in Kbytes of console log to store"
+ depends on PSTORE_BLK
+ depends on PSTORE_CONSOLE
+ default 64
+ help
+ This just sets size of console log (console_size) to store via
+ pstore/blk. The size is in KB and must be a multiple of 4.
+
+ NOTE that, both Kconfig and module parameters can configure
+ pstore/blk, but module parameters have priority over Kconfig.
+
+config PSTORE_BLK_FTRACE_SIZE
+ int "Size in Kbytes of ftrace log to store"
+ depends on PSTORE_BLK
+ depends on PSTORE_FTRACE
+ default 64
+ help
+ This just sets size of ftrace log (ftrace_size) for pstore/blk. The
+ size is in KB and must be a multiple of 4.
+
+ NOTE that, both Kconfig and module parameters can configure
+ pstore/blk, but module parameters have priority over Kconfig.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 967b5891f325..c270467aeece 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -12,3 +12,9 @@ pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o
ramoops-objs += ram.o ram_core.o
obj-$(CONFIG_PSTORE_RAM) += ramoops.o
+
+pstore_zone-objs += zone.o
+obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o
+
+pstore_blk-objs += blk.o
+obj-$(CONFIG_PSTORE_BLK) += pstore_blk.o
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
new file mode 100644
index 000000000000..fcd5563dde06
--- /dev/null
+++ b/fs/pstore/blk.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implements pstore backend driver that write to block (or non-block) storage
+ * devices, using the pstore/zone API.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include "../../block/blk.h"
+#include <linux/blkdev.h>
+#include <linux/string.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/pstore_blk.h>
+#include <linux/mount.h>
+#include <linux/uio.h>
+
+static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE;
+module_param(kmsg_size, long, 0400);
+MODULE_PARM_DESC(kmsg_size, "kmsg dump record size in kbytes");
+
+static int max_reason = CONFIG_PSTORE_BLK_MAX_REASON;
+module_param(max_reason, int, 0400);
+MODULE_PARM_DESC(max_reason,
+ "maximum reason for kmsg dump (default 2: Oops and Panic)");
+
+#if IS_ENABLED(CONFIG_PSTORE_PMSG)
+static long pmsg_size = CONFIG_PSTORE_BLK_PMSG_SIZE;
+#else
+static long pmsg_size = -1;
+#endif
+module_param(pmsg_size, long, 0400);
+MODULE_PARM_DESC(pmsg_size, "pmsg size in kbytes");
+
+#if IS_ENABLED(CONFIG_PSTORE_CONSOLE)
+static long console_size = CONFIG_PSTORE_BLK_CONSOLE_SIZE;
+#else
+static long console_size = -1;
+#endif
+module_param(console_size, long, 0400);
+MODULE_PARM_DESC(console_size, "console size in kbytes");
+
+#if IS_ENABLED(CONFIG_PSTORE_FTRACE)
+static long ftrace_size = CONFIG_PSTORE_BLK_FTRACE_SIZE;
+#else
+static long ftrace_size = -1;
+#endif
+module_param(ftrace_size, long, 0400);
+MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes");
+
+static bool best_effort;
+module_param(best_effort, bool, 0400);
+MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. do not require storage driver pstore support, default: off)");
+
+/*
+ * blkdev - the block device to use for pstore storage
+ *
+ * Usually, this will be a partition of a block device.
+ *
+ * blkdev accepts the following variants:
+ * 1) <hex_major><hex_minor> device number in hexadecimal representation,
+ * with no leading 0x, for example b302.
+ * 2) /dev/<disk_name> represents the device number of disk
+ * 3) /dev/<disk_name><decimal> represents the device number
+ * of partition - device number of disk plus the partition number
+ * 4) /dev/<disk_name>p<decimal> - same as the above, that form is
+ * used when disk name of partitioned disk ends on a digit.
+ * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
+ * unique id of a partition if the partition table provides it.
+ * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS
+ * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero-
+ * filled hex representation of the 32-bit "NT disk signature", and PP
+ * is a zero-filled hex representation of the 1-based partition number.
+ * 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to
+ * a partition with a known unique id.
+ * 7) <major>:<minor> major and minor number of the device separated by
+ * a colon.
+ */
+static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV;
+module_param_string(blkdev, blkdev, 80, 0400);
+MODULE_PARM_DESC(blkdev, "block device for pstore storage");
+
+/*
+ * All globals must only be accessed under the pstore_blk_lock
+ * during the register/unregister functions.
+ */
+static DEFINE_MUTEX(pstore_blk_lock);
+static struct block_device *psblk_bdev;
+static struct pstore_zone_info *pstore_zone_info;
+static pstore_blk_panic_write_op blkdev_panic_write;
+
+struct bdev_info {
+ dev_t devt;
+ sector_t nr_sects;
+ sector_t start_sect;
+};
+
+#define check_size(name, alignsize) ({ \
+ long _##name_ = (name); \
+ _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \
+ if (_##name_ & ((alignsize) - 1)) { \
+ pr_info(#name " must align to %d\n", \
+ (alignsize)); \
+ _##name_ = ALIGN(name, (alignsize)); \
+ } \
+ _##name_; \
+})
+
+static int __register_pstore_device(struct pstore_device_info *dev)
+{
+ int ret;
+
+ lockdep_assert_held(&pstore_blk_lock);
+
+ if (!dev || !dev->total_size || !dev->read || !dev->write)
+ return -EINVAL;
+
+ /* someone already registered before */
+ if (pstore_zone_info)
+ return -EBUSY;
+
+ pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL);
+ if (!pstore_zone_info)
+ return -ENOMEM;
+
+ /* zero means not limit on which backends to attempt to store. */
+ if (!dev->flags)
+ dev->flags = UINT_MAX;
+
+#define verify_size(name, alignsize, enabled) { \
+ long _##name_; \
+ if (enabled) \
+ _##name_ = check_size(name, alignsize); \
+ else \
+ _##name_ = 0; \
+ name = _##name_ / 1024; \
+ pstore_zone_info->name = _##name_; \
+ }
+
+ verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG);
+ verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG);
+ verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE);
+ verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE);
+#undef verify_size
+
+ pstore_zone_info->total_size = dev->total_size;
+ pstore_zone_info->max_reason = max_reason;
+ pstore_zone_info->read = dev->read;
+ pstore_zone_info->write = dev->write;
+ pstore_zone_info->erase = dev->erase;
+ pstore_zone_info->panic_write = dev->panic_write;
+ pstore_zone_info->name = KBUILD_MODNAME;
+ pstore_zone_info->owner = THIS_MODULE;
+
+ ret = register_pstore_zone(pstore_zone_info);
+ if (ret) {
+ kfree(pstore_zone_info);
+ pstore_zone_info = NULL;
+ }
+ return ret;
+}
+/**
+ * register_pstore_device() - register non-block device to pstore/blk
+ *
+ * @dev: non-block device information
+ *
+ * Return:
+ * * 0 - OK
+ * * Others - something error.
+ */
+int register_pstore_device(struct pstore_device_info *dev)
+{
+ int ret;
+
+ mutex_lock(&pstore_blk_lock);
+ ret = __register_pstore_device(dev);
+ mutex_unlock(&pstore_blk_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_pstore_device);
+
+static void __unregister_pstore_device(struct pstore_device_info *dev)
+{
+ lockdep_assert_held(&pstore_blk_lock);
+ if (pstore_zone_info && pstore_zone_info->read == dev->read) {
+ unregister_pstore_zone(pstore_zone_info);
+ kfree(pstore_zone_info);
+ pstore_zone_info = NULL;
+ }
+}
+
+/**
+ * unregister_pstore_device() - unregister non-block device from pstore/blk
+ *
+ * @dev: non-block device information
+ */
+void unregister_pstore_device(struct pstore_device_info *dev)
+{
+ mutex_lock(&pstore_blk_lock);
+ __unregister_pstore_device(dev);
+ mutex_unlock(&pstore_blk_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_pstore_device);
+
+/**
+ * psblk_get_bdev() - open block device
+ *
+ * @holder: Exclusive holder identifier
+ * @info: Information about bdev to fill in
+ *
+ * Return: pointer to block device on success and others on error.
+ *
+ * On success, the returned block_device has reference count of one.
+ */
+static struct block_device *psblk_get_bdev(void *holder,
+ struct bdev_info *info)
+{
+ struct block_device *bdev = ERR_PTR(-ENODEV);
+ fmode_t mode = FMODE_READ | FMODE_WRITE;
+ sector_t nr_sects;
+
+ lockdep_assert_held(&pstore_blk_lock);
+
+ if (pstore_zone_info)
+ return ERR_PTR(-EBUSY);
+
+ if (!blkdev[0])
+ return ERR_PTR(-ENODEV);
+
+ if (holder)
+ mode |= FMODE_EXCL;
+ bdev = blkdev_get_by_path(blkdev, mode, holder);
+ if (IS_ERR(bdev)) {
+ dev_t devt;
+
+ devt = name_to_dev_t(blkdev);
+ if (devt == 0)
+ return ERR_PTR(-ENODEV);
+ bdev = blkdev_get_by_dev(devt, mode, holder);
+ if (IS_ERR(bdev))
+ return bdev;
+ }
+
+ nr_sects = part_nr_sects_read(bdev->bd_part);
+ if (!nr_sects) {
+ pr_err("not enough space for '%s'\n", blkdev);
+ blkdev_put(bdev, mode);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ if (info) {
+ info->devt = bdev->bd_dev;
+ info->nr_sects = nr_sects;
+ info->start_sect = get_start_sect(bdev);
+ }
+
+ return bdev;
+}
+
+static void psblk_put_bdev(struct block_device *bdev, void *holder)
+{
+ fmode_t mode = FMODE_READ | FMODE_WRITE;
+
+ lockdep_assert_held(&pstore_blk_lock);
+
+ if (!bdev)
+ return;
+
+ if (holder)
+ mode |= FMODE_EXCL;
+ blkdev_put(bdev, mode);
+}
+
+static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos)
+{
+ struct block_device *bdev = psblk_bdev;
+ struct file file;
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ struct kvec iov = {.iov_base = buf, .iov_len = bytes};
+
+ if (!bdev)
+ return -ENODEV;
+
+ memset(&file, 0, sizeof(struct file));
+ file.f_mapping = bdev->bd_inode->i_mapping;
+ file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME;
+ file.f_inode = bdev->bd_inode;
+ file_ra_state_init(&file.f_ra, file.f_mapping);
+
+ init_sync_kiocb(&kiocb, &file);
+ kiocb.ki_pos = pos;
+ iov_iter_kvec(&iter, READ, &iov, 1, bytes);
+
+ return generic_file_read_iter(&kiocb, &iter);
+}
+
+static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
+ loff_t pos)
+{
+ struct block_device *bdev = psblk_bdev;
+ struct iov_iter iter;
+ struct kiocb kiocb;
+ struct file file;
+ ssize_t ret;
+ struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes};
+
+ if (!bdev)
+ return -ENODEV;
+
+ /* Console/Ftrace backend may handle buffer until flush dirty zones */
+ if (in_interrupt() || irqs_disabled())
+ return -EBUSY;
+
+ memset(&file, 0, sizeof(struct file));
+ file.f_mapping = bdev->bd_inode->i_mapping;
+ file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME;
+ file.f_inode = bdev->bd_inode;
+
+ init_sync_kiocb(&kiocb, &file);
+ kiocb.ki_pos = pos;
+ iov_iter_kvec(&iter, WRITE, &iov, 1, bytes);
+
+ inode_lock(bdev->bd_inode);
+ ret = generic_write_checks(&kiocb, &iter);
+ if (ret > 0)
+ ret = generic_perform_write(&file, &iter, pos);
+ inode_unlock(bdev->bd_inode);
+
+ if (likely(ret > 0)) {
+ const struct file_operations f_op = {.fsync = blkdev_fsync};
+
+ file.f_op = &f_op;
+ kiocb.ki_pos += ret;
+ ret = generic_write_sync(&kiocb, ret);
+ }
+ return ret;
+}
+
+static ssize_t psblk_blk_panic_write(const char *buf, size_t size,
+ loff_t off)
+{
+ int ret;
+
+ if (!blkdev_panic_write)
+ return -EOPNOTSUPP;
+
+ /* size and off must align to SECTOR_SIZE for block device */
+ ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT,
+ size >> SECTOR_SHIFT);
+ /* try next zone */
+ if (ret == -ENOMSG)
+ return ret;
+ return ret ? -EIO : size;
+}
+
+static int __register_pstore_blk(struct pstore_blk_info *info)
+{
+ char bdev_name[BDEVNAME_SIZE];
+ struct block_device *bdev;
+ struct pstore_device_info dev;
+ struct bdev_info binfo;
+ void *holder = blkdev;
+ int ret = -ENODEV;
+
+ lockdep_assert_held(&pstore_blk_lock);
+
+ /* hold bdev exclusively */
+ memset(&binfo, 0, sizeof(binfo));
+ bdev = psblk_get_bdev(holder, &binfo);
+ if (IS_ERR(bdev)) {
+ pr_err("failed to open '%s'!\n", blkdev);
+ return PTR_ERR(bdev);
+ }
+
+ /* only allow driver matching the @blkdev */
+ if (!binfo.devt || (!best_effort &&
+ MAJOR(binfo.devt) != info->major)) {
+ pr_debug("invalid major %u (expect %u)\n",
+ info->major, MAJOR(binfo.devt));
+ ret = -ENODEV;
+ goto err_put_bdev;
+ }
+
+ /* psblk_bdev must be assigned before register to pstore/blk */
+ psblk_bdev = bdev;
+ blkdev_panic_write = info->panic_write;
+
+ /* Copy back block device details. */
+ info->devt = binfo.devt;
+ info->nr_sects = binfo.nr_sects;
+ info->start_sect = binfo.start_sect;
+
+ memset(&dev, 0, sizeof(dev));
+ dev.total_size = info->nr_sects << SECTOR_SHIFT;
+ dev.flags = info->flags;
+ dev.read = psblk_generic_blk_read;
+ dev.write = psblk_generic_blk_write;
+ dev.erase = NULL;
+ dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL;
+
+ ret = __register_pstore_device(&dev);
+ if (ret)
+ goto err_put_bdev;
+
+ bdevname(bdev, bdev_name);
+ pr_info("attached %s%s\n", bdev_name,
+ info->panic_write ? "" : " (no dedicated panic_write!)");
+ return 0;
+
+err_put_bdev:
+ psblk_bdev = NULL;
+ blkdev_panic_write = NULL;
+ psblk_put_bdev(bdev, holder);
+ return ret;
+}
+
+/**
+ * register_pstore_blk() - register block device to pstore/blk
+ *
+ * @info: details on the desired block device interface
+ *
+ * Return:
+ * * 0 - OK
+ * * Others - something error.
+ */
+int register_pstore_blk(struct pstore_blk_info *info)
+{
+ int ret;
+
+ mutex_lock(&pstore_blk_lock);
+ ret = __register_pstore_blk(info);
+ mutex_unlock(&pstore_blk_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_pstore_blk);
+
+static void __unregister_pstore_blk(unsigned int major)
+{
+ struct pstore_device_info dev = { .read = psblk_generic_blk_read };
+ void *holder = blkdev;
+
+ lockdep_assert_held(&pstore_blk_lock);
+ if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) {
+ __unregister_pstore_device(&dev);
+ psblk_put_bdev(psblk_bdev, holder);
+ blkdev_panic_write = NULL;
+ psblk_bdev = NULL;
+ }
+}
+
+/**
+ * unregister_pstore_blk() - unregister block device from pstore/blk
+ *
+ * @major: the major device number of device
+ */
+void unregister_pstore_blk(unsigned int major)
+{
+ mutex_lock(&pstore_blk_lock);
+ __unregister_pstore_blk(major);
+ mutex_unlock(&pstore_blk_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_pstore_blk);
+
+/* get information of pstore/blk */
+int pstore_blk_get_config(struct pstore_blk_config *info)
+{
+ strncpy(info->device, blkdev, 80);
+ info->max_reason = max_reason;
+ info->kmsg_size = check_size(kmsg_size, 4096);
+ info->pmsg_size = check_size(pmsg_size, 4096);
+ info->ftrace_size = check_size(ftrace_size, 4096);
+ info->console_size = check_size(console_size, 4096);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_blk_get_config);
+
+static int __init pstore_blk_init(void)
+{
+ struct pstore_blk_info info = { };
+ int ret = 0;
+
+ mutex_lock(&pstore_blk_lock);
+ if (!pstore_zone_info && best_effort && blkdev[0])
+ ret = __register_pstore_blk(&info);
+ mutex_unlock(&pstore_blk_lock);
+
+ return ret;
+}
+late_initcall(pstore_blk_init);
+
+static void __exit pstore_blk_exit(void)
+{
+ mutex_lock(&pstore_blk_lock);
+ if (psblk_bdev)
+ __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev));
+ else {
+ struct pstore_device_info dev = { };
+
+ if (pstore_zone_info)
+ dev.read = pstore_zone_info->read;
+ __unregister_pstore_device(&dev);
+ }
+ mutex_unlock(&pstore_blk_lock);
+}
+module_exit(pstore_blk_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@allwinnertech.com>");
+MODULE_AUTHOR("Kees Cook <keescook@chromium.org>");
+MODULE_DESCRIPTION("pstore backend for block devices");
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index bfbfc2698070..5c0450701293 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -16,6 +16,7 @@
#include <linux/debugfs.h>
#include <linux/err.h>
#include <linux/cache.h>
+#include <linux/slab.h>
#include <asm/barrier.h>
#include "internal.h"
@@ -132,3 +133,56 @@ void pstore_unregister_ftrace(void)
debugfs_remove_recursive(pstore_ftrace_dir);
}
+
+ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size,
+ const char *src_log, size_t src_log_size)
+{
+ size_t dest_size, src_size, total, dest_off, src_off;
+ size_t dest_idx = 0, src_idx = 0, merged_idx = 0;
+ void *merged_buf;
+ struct pstore_ftrace_record *drec, *srec, *mrec;
+ size_t record_size = sizeof(struct pstore_ftrace_record);
+
+ dest_off = *dest_log_size % record_size;
+ dest_size = *dest_log_size - dest_off;
+
+ src_off = src_log_size % record_size;
+ src_size = src_log_size - src_off;
+
+ total = dest_size + src_size;
+ merged_buf = kmalloc(total, GFP_KERNEL);
+ if (!merged_buf)
+ return -ENOMEM;
+
+ drec = (struct pstore_ftrace_record *)(*dest_log + dest_off);
+ srec = (struct pstore_ftrace_record *)(src_log + src_off);
+ mrec = (struct pstore_ftrace_record *)(merged_buf);
+
+ while (dest_size > 0 && src_size > 0) {
+ if (pstore_ftrace_read_timestamp(&drec[dest_idx]) <
+ pstore_ftrace_read_timestamp(&srec[src_idx])) {
+ mrec[merged_idx++] = drec[dest_idx++];
+ dest_size -= record_size;
+ } else {
+ mrec[merged_idx++] = srec[src_idx++];
+ src_size -= record_size;
+ }
+ }
+
+ while (dest_size > 0) {
+ mrec[merged_idx++] = drec[dest_idx++];
+ dest_size -= record_size;
+ }
+
+ while (src_size > 0) {
+ mrec[merged_idx++] = srec[src_idx++];
+ src_size -= record_size;
+ }
+
+ kfree(*dest_log);
+ *dest_log = merged_buf;
+ *dest_log_size = total;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_ftrace_combine_log);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index d99b5d39aa90..c331efe8de95 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -22,18 +22,21 @@
#include <linux/magic.h>
#include <linux/pstore.h>
#include <linux/slab.h>
-#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include "internal.h"
#define PSTORE_NAMELEN 64
-static DEFINE_SPINLOCK(allpstore_lock);
-static LIST_HEAD(allpstore);
+static DEFINE_MUTEX(records_list_lock);
+static LIST_HEAD(records_list);
+
+static DEFINE_MUTEX(pstore_sb_lock);
+static struct super_block *pstore_sb;
struct pstore_private {
struct list_head list;
+ struct dentry *dentry;
struct pstore_record *record;
size_t total_size;
};
@@ -178,10 +181,22 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
{
struct pstore_private *p = d_inode(dentry)->i_private;
struct pstore_record *record = p->record;
+ int rc = 0;
if (!record->psi->erase)
return -EPERM;
+ /* Make sure we can't race while removing this file. */
+ mutex_lock(&records_list_lock);
+ if (!list_empty(&p->list))
+ list_del_init(&p->list);
+ else
+ rc = -ENOENT;
+ p->dentry = NULL;
+ mutex_unlock(&records_list_lock);
+ if (rc)
+ return rc;
+
mutex_lock(&record->psi->read_mutex);
record->psi->erase(record);
mutex_unlock(&record->psi->read_mutex);
@@ -192,15 +207,9 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
static void pstore_evict_inode(struct inode *inode)
{
struct pstore_private *p = inode->i_private;
- unsigned long flags;
clear_inode(inode);
- if (p) {
- spin_lock_irqsave(&allpstore_lock, flags);
- list_del(&p->list);
- spin_unlock_irqrestore(&allpstore_lock, flags);
- free_pstore_private(p);
- }
+ free_pstore_private(p);
}
static const struct inode_operations pstore_dir_inode_operations = {
@@ -278,11 +287,54 @@ static const struct super_operations pstore_ops = {
.show_options = pstore_show_options,
};
-static struct super_block *pstore_sb;
+static struct dentry *psinfo_lock_root(void)
+{
+ struct dentry *root;
-bool pstore_is_mounted(void)
+ mutex_lock(&pstore_sb_lock);
+ /*
+ * Having no backend is fine -- no records appear.
+ * Not being mounted is fine -- nothing to do.
+ */
+ if (!psinfo || !pstore_sb) {
+ mutex_unlock(&pstore_sb_lock);
+ return NULL;
+ }
+
+ root = pstore_sb->s_root;
+ inode_lock(d_inode(root));
+ mutex_unlock(&pstore_sb_lock);
+
+ return root;
+}
+
+int pstore_put_backend_records(struct pstore_info *psi)
{
- return pstore_sb != NULL;
+ struct pstore_private *pos, *tmp;
+ struct dentry *root;
+ int rc = 0;
+
+ root = psinfo_lock_root();
+ if (!root)
+ return 0;
+
+ mutex_lock(&records_list_lock);
+ list_for_each_entry_safe(pos, tmp, &records_list, list) {
+ if (pos->record->psi == psi) {
+ list_del_init(&pos->list);
+ rc = simple_unlink(d_inode(root), pos->dentry);
+ if (WARN_ON(rc))
+ break;
+ d_drop(pos->dentry);
+ dput(pos->dentry);
+ pos->dentry = NULL;
+ }
+ }
+ mutex_unlock(&records_list_lock);
+
+ inode_unlock(d_inode(root));
+
+ return rc;
}
/*
@@ -297,23 +349,20 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
int rc = 0;
char name[PSTORE_NAMELEN];
struct pstore_private *private, *pos;
- unsigned long flags;
size_t size = record->size + record->ecc_notice_size;
- WARN_ON(!inode_is_locked(d_inode(root)));
+ if (WARN_ON(!inode_is_locked(d_inode(root))))
+ return -EINVAL;
- spin_lock_irqsave(&allpstore_lock, flags);
- list_for_each_entry(pos, &allpstore, list) {
+ rc = -EEXIST;
+ /* Skip records that are already present in the filesystem. */
+ mutex_lock(&records_list_lock);
+ list_for_each_entry(pos, &records_list, list) {
if (pos->record->type == record->type &&
pos->record->id == record->id &&
- pos->record->psi == record->psi) {
- rc = -EEXIST;
- break;
- }
+ pos->record->psi == record->psi)
+ goto fail;
}
- spin_unlock_irqrestore(&allpstore_lock, flags);
- if (rc)
- return rc;
rc = -ENOMEM;
inode = pstore_get_inode(root->d_sb);
@@ -334,6 +383,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
if (!dentry)
goto fail_private;
+ private->dentry = dentry;
private->record = record;
inode->i_size = private->total_size = size;
inode->i_private = private;
@@ -343,9 +393,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
d_add(dentry, inode);
- spin_lock_irqsave(&allpstore_lock, flags);
- list_add(&private->list, &allpstore);
- spin_unlock_irqrestore(&allpstore_lock, flags);
+ list_add(&private->list, &records_list);
+ mutex_unlock(&records_list_lock);
return 0;
@@ -353,8 +402,8 @@ fail_private:
free_pstore_private(private);
fail_inode:
iput(inode);
-
fail:
+ mutex_unlock(&records_list_lock);
return rc;
}
@@ -366,16 +415,13 @@ fail:
*/
void pstore_get_records(int quiet)
{
- struct pstore_info *psi = psinfo;
struct dentry *root;
- if (!psi || !pstore_sb)
+ root = psinfo_lock_root();
+ if (!root)
return;
- root = pstore_sb->s_root;
-
- inode_lock(d_inode(root));
- pstore_get_backend_records(psi, root, quiet);
+ pstore_get_backend_records(psinfo, root, quiet);
inode_unlock(d_inode(root));
}
@@ -383,8 +429,6 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
- pstore_sb = sb;
-
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
@@ -405,6 +449,10 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
if (!sb->s_root)
return -ENOMEM;
+ mutex_lock(&pstore_sb_lock);
+ pstore_sb = sb;
+ mutex_unlock(&pstore_sb_lock);
+
pstore_get_records(0);
return 0;
@@ -418,8 +466,17 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type,
static void pstore_kill_sb(struct super_block *sb)
{
+ mutex_lock(&pstore_sb_lock);
+ WARN_ON(pstore_sb != sb);
+
kill_litter_super(sb);
pstore_sb = NULL;
+
+ mutex_lock(&records_list_lock);
+ INIT_LIST_HEAD(&records_list);
+ mutex_unlock(&records_list_lock);
+
+ mutex_unlock(&pstore_sb_lock);
}
static struct file_system_type pstore_fs_type = {
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 7062ea4bc57c..7fb219042f13 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -12,9 +12,18 @@ extern unsigned long kmsg_bytes;
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
extern void pstore_unregister_ftrace(void);
+ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size,
+ const char *src_log, size_t src_log_size);
#else
static inline void pstore_register_ftrace(void) {}
static inline void pstore_unregister_ftrace(void) {}
+static inline ssize_t
+pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size,
+ const char *src_log, size_t src_log_size)
+{
+ *dest_log_size = 0;
+ return 0;
+}
#endif
#ifdef CONFIG_PSTORE_PMSG
@@ -31,9 +40,9 @@ extern void pstore_set_kmsg_bytes(int);
extern void pstore_get_records(int);
extern void pstore_get_backend_records(struct pstore_info *psi,
struct dentry *root, int quiet);
+extern int pstore_put_backend_records(struct pstore_info *psi);
extern int pstore_mkfile(struct dentry *root,
struct pstore_record *record);
-extern bool pstore_is_mounted(void);
extern void pstore_record_init(struct pstore_record *record,
struct pstore_info *psi);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 408277ee3cdb..a9e297eefdff 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -44,7 +44,7 @@ static int pstore_update_ms = -1;
module_param_named(update_ms, pstore_update_ms, int, 0600);
MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content "
"(default is -1, which means runtime updates are disabled; "
- "enabling this option is not safe, it may lead to further "
+ "enabling this option may not be safe; it may lead to further "
"corruption on Oopses)");
/* Names should be in the same order as the enum pstore_type_id */
@@ -69,19 +69,25 @@ static void pstore_dowork(struct work_struct *);
static DECLARE_WORK(pstore_work, pstore_dowork);
/*
- * pstore_lock just protects "psinfo" during
- * calls to pstore_register()
+ * psinfo_lock protects "psinfo" during calls to
+ * pstore_register(), pstore_unregister(), and
+ * the filesystem mount/unmount routines.
*/
-static DEFINE_SPINLOCK(pstore_lock);
+static DEFINE_MUTEX(psinfo_lock);
struct pstore_info *psinfo;
static char *backend;
+module_param(backend, charp, 0444);
+MODULE_PARM_DESC(backend, "specific backend to use");
+
static char *compress =
#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT
CONFIG_PSTORE_COMPRESS_DEFAULT;
#else
NULL;
#endif
+module_param(compress, charp, 0444);
+MODULE_PARM_DESC(compress, "compression to use");
/* Compression parameters */
static struct crypto_comp *tfm;
@@ -129,24 +135,12 @@ enum pstore_type_id pstore_name_to_type(const char *name)
}
EXPORT_SYMBOL_GPL(pstore_name_to_type);
-static const char *get_reason_str(enum kmsg_dump_reason reason)
+static void pstore_timer_kick(void)
{
- switch (reason) {
- case KMSG_DUMP_PANIC:
- return "Panic";
- case KMSG_DUMP_OOPS:
- return "Oops";
- case KMSG_DUMP_EMERG:
- return "Emergency";
- case KMSG_DUMP_RESTART:
- return "Restart";
- case KMSG_DUMP_HALT:
- return "Halt";
- case KMSG_DUMP_POWEROFF:
- return "Poweroff";
- default:
- return "Unknown";
- }
+ if (pstore_update_ms < 0)
+ return;
+
+ mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
/*
@@ -393,7 +387,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned int part = 1;
int ret;
- why = get_reason_str(reason);
+ why = kmsg_dump_reason_str(reason);
if (down_trylock(&psinfo->buf_lock)) {
/* Failed to acquire lock: give up if we cannot wait. */
@@ -459,8 +453,10 @@ static void pstore_dump(struct kmsg_dumper *dumper,
}
ret = psinfo->write(&record);
- if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
+ if (ret == 0 && reason == KMSG_DUMP_OOPS) {
pstore_new_entry = 1;
+ pstore_timer_kick();
+ }
total += record.size;
part++;
@@ -503,14 +499,20 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
}
static struct console pstore_console = {
- .name = "pstore",
.write = pstore_console_write,
- .flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME,
.index = -1,
};
static void pstore_register_console(void)
{
+ /* Show which backend is going to get console writes. */
+ strscpy(pstore_console.name, psinfo->name,
+ sizeof(pstore_console.name));
+ /*
+ * Always initialize flags here since prior unregister_console()
+ * calls may have changed settings (specifically CON_ENABLED).
+ */
+ pstore_console.flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME;
register_console(&pstore_console);
}
@@ -555,8 +557,6 @@ out:
*/
int pstore_register(struct pstore_info *psi)
{
- struct module *owner = psi->owner;
-
if (backend && strcmp(backend, psi->name)) {
pr_warn("ignoring unexpected backend '%s'\n", psi->name);
return -EPERM;
@@ -576,11 +576,11 @@ int pstore_register(struct pstore_info *psi)
return -EINVAL;
}
- spin_lock(&pstore_lock);
+ mutex_lock(&psinfo_lock);
if (psinfo) {
pr_warn("backend '%s' already loaded: ignoring '%s'\n",
psinfo->name, psi->name);
- spin_unlock(&pstore_lock);
+ mutex_unlock(&psinfo_lock);
return -EBUSY;
}
@@ -589,21 +589,16 @@ int pstore_register(struct pstore_info *psi)
psinfo = psi;
mutex_init(&psinfo->read_mutex);
sema_init(&psinfo->buf_lock, 1);
- spin_unlock(&pstore_lock);
-
- if (owner && !try_module_get(owner)) {
- psinfo = NULL;
- return -EINVAL;
- }
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
- if (pstore_is_mounted())
- pstore_get_records(0);
+ pstore_get_records(0);
- if (psi->flags & PSTORE_FLAGS_DMESG)
+ if (psi->flags & PSTORE_FLAGS_DMESG) {
+ pstore_dumper.max_reason = psinfo->max_reason;
pstore_register_kmsg();
+ }
if (psi->flags & PSTORE_FLAGS_CONSOLE)
pstore_register_console();
if (psi->flags & PSTORE_FLAGS_FTRACE)
@@ -612,33 +607,36 @@ int pstore_register(struct pstore_info *psi)
pstore_register_pmsg();
/* Start watching for new records, if desired. */
- if (pstore_update_ms >= 0) {
- pstore_timer.expires = jiffies +
- msecs_to_jiffies(pstore_update_ms);
- add_timer(&pstore_timer);
- }
+ pstore_timer_kick();
/*
* Update the module parameter backend, so it is visible
* through /sys/module/pstore/parameters/backend
*/
- backend = psi->name;
+ backend = kstrdup(psi->name, GFP_KERNEL);
pr_info("Registered %s as persistent store backend\n", psi->name);
- module_put(owner);
-
+ mutex_unlock(&psinfo_lock);
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
void pstore_unregister(struct pstore_info *psi)
{
- /* Stop timer and make sure all work has finished. */
- pstore_update_ms = -1;
- del_timer_sync(&pstore_timer);
- flush_work(&pstore_work);
+ /* It's okay to unregister nothing. */
+ if (!psi)
+ return;
+
+ mutex_lock(&psinfo_lock);
+
+ /* Only one backend can be registered at a time. */
+ if (WARN_ON(psi != psinfo)) {
+ mutex_unlock(&psinfo_lock);
+ return;
+ }
+ /* Unregister all callbacks. */
if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_unregister_pmsg();
if (psi->flags & PSTORE_FLAGS_FTRACE)
@@ -648,10 +646,19 @@ void pstore_unregister(struct pstore_info *psi)
if (psi->flags & PSTORE_FLAGS_DMESG)
pstore_unregister_kmsg();
+ /* Stop timer and make sure all work has finished. */
+ del_timer_sync(&pstore_timer);
+ flush_work(&pstore_work);
+
+ /* Remove all backend records from filesystem tree. */
+ pstore_put_backend_records(psi);
+
free_buf_for_compression();
psinfo = NULL;
+ kfree(backend);
backend = NULL;
+ mutex_unlock(&psinfo_lock);
}
EXPORT_SYMBOL_GPL(pstore_unregister);
@@ -788,9 +795,7 @@ static void pstore_timefunc(struct timer_list *unused)
schedule_work(&pstore_work);
}
- if (pstore_update_ms >= 0)
- mod_timer(&pstore_timer,
- jiffies + msecs_to_jiffies(pstore_update_ms));
+ pstore_timer_kick();
}
static void __init pstore_choose_compression(void)
@@ -835,11 +840,5 @@ static void __exit pstore_exit(void)
}
module_exit(pstore_exit)
-module_param(compress, charp, 0444);
-MODULE_PARM_DESC(compress, "Pstore compression to use");
-
-module_param(backend, charp, 0444);
-MODULE_PARM_DESC(backend, "Pstore backend to use");
-
MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
MODULE_LICENSE("GPL");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 795622190c01..ca6d8a867285 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -21,6 +21,7 @@
#include <linux/pstore_ram.h>
#include <linux/of.h>
#include <linux/of_address.h>
+#include "internal.h"
#define RAMOOPS_KERNMSG_HDR "===="
#define MIN_MEM_SIZE 4096UL
@@ -53,22 +54,27 @@ MODULE_PARM_DESC(mem_size,
"size of reserved RAM used to store oops/panic logs");
static unsigned int mem_type;
-module_param(mem_type, uint, 0600);
+module_param(mem_type, uint, 0400);
MODULE_PARM_DESC(mem_type,
"set to 1 to try to use unbuffered memory (default 0)");
-static int dump_oops = 1;
-module_param(dump_oops, int, 0600);
-MODULE_PARM_DESC(dump_oops,
- "set to 1 to dump oopses, 0 to only dump panics (default 1)");
+static int ramoops_max_reason = -1;
+module_param_named(max_reason, ramoops_max_reason, int, 0400);
+MODULE_PARM_DESC(max_reason,
+ "maximum reason for kmsg dump (default 2: Oops and Panic) ");
static int ramoops_ecc;
-module_param_named(ecc, ramoops_ecc, int, 0600);
+module_param_named(ecc, ramoops_ecc, int, 0400);
MODULE_PARM_DESC(ramoops_ecc,
"if non-zero, the option enables ECC support and specifies "
"ECC buffer size in bytes (1 is a special value, means 16 "
"bytes ECC)");
+static int ramoops_dump_oops = -1;
+module_param_named(dump_oops, ramoops_dump_oops, int, 0400);
+MODULE_PARM_DESC(dump_oops,
+ "(deprecated: use max_reason instead) set to 1 to dump oopses & panics, 0 to only dump panics");
+
struct ramoops_context {
struct persistent_ram_zone **dprzs; /* Oops dump zones */
struct persistent_ram_zone *cprz; /* Console zone */
@@ -81,7 +87,6 @@ struct ramoops_context {
size_t console_size;
size_t ftrace_size;
size_t pmsg_size;
- int dump_oops;
u32 flags;
struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
@@ -168,58 +173,6 @@ static bool prz_ok(struct persistent_ram_zone *prz)
persistent_ram_ecc_string(prz, NULL, 0));
}
-static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest,
- struct persistent_ram_zone *src)
-{
- size_t dest_size, src_size, total, dest_off, src_off;
- size_t dest_idx = 0, src_idx = 0, merged_idx = 0;
- void *merged_buf;
- struct pstore_ftrace_record *drec, *srec, *mrec;
- size_t record_size = sizeof(struct pstore_ftrace_record);
-
- dest_off = dest->old_log_size % record_size;
- dest_size = dest->old_log_size - dest_off;
-
- src_off = src->old_log_size % record_size;
- src_size = src->old_log_size - src_off;
-
- total = dest_size + src_size;
- merged_buf = kmalloc(total, GFP_KERNEL);
- if (!merged_buf)
- return -ENOMEM;
-
- drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off);
- srec = (struct pstore_ftrace_record *)(src->old_log + src_off);
- mrec = (struct pstore_ftrace_record *)(merged_buf);
-
- while (dest_size > 0 && src_size > 0) {
- if (pstore_ftrace_read_timestamp(&drec[dest_idx]) <
- pstore_ftrace_read_timestamp(&srec[src_idx])) {
- mrec[merged_idx++] = drec[dest_idx++];
- dest_size -= record_size;
- } else {
- mrec[merged_idx++] = srec[src_idx++];
- src_size -= record_size;
- }
- }
-
- while (dest_size > 0) {
- mrec[merged_idx++] = drec[dest_idx++];
- dest_size -= record_size;
- }
-
- while (src_size > 0) {
- mrec[merged_idx++] = srec[src_idx++];
- src_size -= record_size;
- }
-
- kfree(dest->old_log);
- dest->old_log = merged_buf;
- dest->old_log_size = total;
-
- return 0;
-}
-
static ssize_t ramoops_pstore_read(struct pstore_record *record)
{
ssize_t size = 0;
@@ -291,7 +244,12 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
tmp_prz->corrected_bytes +=
prz_next->corrected_bytes;
tmp_prz->bad_blocks += prz_next->bad_blocks;
- size = ftrace_log_combine(tmp_prz, prz_next);
+
+ size = pstore_ftrace_combine_log(
+ &tmp_prz->old_log,
+ &tmp_prz->old_log_size,
+ prz_next->old_log,
+ prz_next->old_log_size);
if (size)
goto out;
}
@@ -382,16 +340,14 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
return -EINVAL;
/*
- * Out of the various dmesg dump types, ramoops is currently designed
- * to only store crash logs, rather than storing general kernel logs.
+ * We could filter on record->reason here if we wanted to (which
+ * would duplicate what happened before the "max_reason" setting
+ * was added), but that would defeat the purpose of a system
+ * changing printk.always_kmsg_dump, so instead log everything that
+ * the kmsg dumper sends us, since it should be doing the filtering
+ * based on the combination of printk.always_kmsg_dump and our
+ * requested "max_reason".
*/
- if (record->reason != KMSG_DUMP_OOPS &&
- record->reason != KMSG_DUMP_PANIC)
- return -EINVAL;
-
- /* Skip Oopes when configured to do so. */
- if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops)
- return -EINVAL;
/*
* Explicitly only take the first part of any new crash.
@@ -644,19 +600,25 @@ static int ramoops_init_prz(const char *name,
return 0;
}
-static int ramoops_parse_dt_size(struct platform_device *pdev,
- const char *propname, u32 *value)
+/* Read a u32 from a dt property and make sure it's safe for an int. */
+static int ramoops_parse_dt_u32(struct platform_device *pdev,
+ const char *propname,
+ u32 default_value, u32 *value)
{
u32 val32 = 0;
int ret;
ret = of_property_read_u32(pdev->dev.of_node, propname, &val32);
- if (ret < 0 && ret != -EINVAL) {
+ if (ret == -EINVAL) {
+ /* field is missing, use default value. */
+ val32 = default_value;
+ } else if (ret < 0) {
dev_err(&pdev->dev, "failed to parse property %s: %d\n",
propname, ret);
return ret;
}
+ /* Sanity check our results. */
if (val32 > INT_MAX) {
dev_err(&pdev->dev, "%s %u > INT_MAX\n", propname, val32);
return -EOVERFLOW;
@@ -687,23 +649,32 @@ static int ramoops_parse_dt(struct platform_device *pdev,
pdata->mem_size = resource_size(res);
pdata->mem_address = res->start;
pdata->mem_type = of_property_read_bool(of_node, "unbuffered");
- pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops");
-
-#define parse_size(name, field) { \
- ret = ramoops_parse_dt_size(pdev, name, &value); \
+ /*
+ * Setting "no-dump-oops" is deprecated and will be ignored if
+ * "max_reason" is also specified.
+ */
+ if (of_property_read_bool(of_node, "no-dump-oops"))
+ pdata->max_reason = KMSG_DUMP_PANIC;
+ else
+ pdata->max_reason = KMSG_DUMP_OOPS;
+
+#define parse_u32(name, field, default_value) { \
+ ret = ramoops_parse_dt_u32(pdev, name, default_value, \
+ &value); \
if (ret < 0) \
return ret; \
field = value; \
}
- parse_size("record-size", pdata->record_size);
- parse_size("console-size", pdata->console_size);
- parse_size("ftrace-size", pdata->ftrace_size);
- parse_size("pmsg-size", pdata->pmsg_size);
- parse_size("ecc-size", pdata->ecc_info.ecc_size);
- parse_size("flags", pdata->flags);
+ parse_u32("record-size", pdata->record_size, 0);
+ parse_u32("console-size", pdata->console_size, 0);
+ parse_u32("ftrace-size", pdata->ftrace_size, 0);
+ parse_u32("pmsg-size", pdata->pmsg_size, 0);
+ parse_u32("ecc-size", pdata->ecc_info.ecc_size, 0);
+ parse_u32("flags", pdata->flags, 0);
+ parse_u32("max-reason", pdata->max_reason, pdata->max_reason);
-#undef parse_size
+#undef parse_u32
/*
* Some old Chromebooks relied on the kernel setting the
@@ -785,7 +756,6 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->console_size = pdata->console_size;
cxt->ftrace_size = pdata->ftrace_size;
cxt->pmsg_size = pdata->pmsg_size;
- cxt->dump_oops = pdata->dump_oops;
cxt->flags = pdata->flags;
cxt->ecc_info = pdata->ecc_info;
@@ -828,8 +798,10 @@ static int ramoops_probe(struct platform_device *pdev)
* the single region size is how to check.
*/
cxt->pstore.flags = 0;
- if (cxt->max_dump_cnt)
+ if (cxt->max_dump_cnt) {
cxt->pstore.flags |= PSTORE_FLAGS_DMESG;
+ cxt->pstore.max_reason = pdata->max_reason;
+ }
if (cxt->console_size)
cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
if (cxt->max_ftrace_cnt)
@@ -865,7 +837,7 @@ static int ramoops_probe(struct platform_device *pdev)
mem_size = pdata->mem_size;
mem_address = pdata->mem_address;
record_size = pdata->record_size;
- dump_oops = pdata->dump_oops;
+ ramoops_max_reason = pdata->max_reason;
ramoops_console_size = pdata->console_size;
ramoops_pmsg_size = pdata->pmsg_size;
ramoops_ftrace_size = pdata->ftrace_size;
@@ -948,7 +920,16 @@ static void __init ramoops_register_dummy(void)
pdata.console_size = ramoops_console_size;
pdata.ftrace_size = ramoops_ftrace_size;
pdata.pmsg_size = ramoops_pmsg_size;
- pdata.dump_oops = dump_oops;
+ /* If "max_reason" is set, its value has priority over "dump_oops". */
+ if (ramoops_max_reason >= 0)
+ pdata.max_reason = ramoops_max_reason;
+ /* Otherwise, if "dump_oops" is set, parse it into "max_reason". */
+ else if (ramoops_dump_oops != -1)
+ pdata.max_reason = ramoops_dump_oops ? KMSG_DUMP_OOPS
+ : KMSG_DUMP_PANIC;
+ /* And if neither are explicitly set, use the default. */
+ else
+ pdata.max_reason = KMSG_DUMP_OOPS;
pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
/*
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
new file mode 100644
index 000000000000..819428dfa32f
--- /dev/null
+++ b/fs/pstore/zone.c
@@ -0,0 +1,1465 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide a pstore intermediate backend, organized into kernel memory
+ * allocated zones that are then mapped and flushed into a single
+ * contiguous region on a storage backend of some kind (block, mtd, etc).
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/printk.h>
+#include <linux/fs.h>
+#include <linux/pstore_zone.h>
+#include <linux/kdev_t.h>
+#include <linux/device.h>
+#include <linux/namei.h>
+#include <linux/fcntl.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include "internal.h"
+
+/**
+ * struct psz_head - header of zone to flush to storage
+ *
+ * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value)
+ * @datalen: length of data in @data
+ * @start: offset into @data where the beginning of the stored bytes begin
+ * @data: zone data.
+ */
+struct psz_buffer {
+#define PSZ_SIG (0x43474244) /* DBGC */
+ uint32_t sig;
+ atomic_t datalen;
+ atomic_t start;
+ uint8_t data[];
+};
+
+/**
+ * struct psz_kmsg_header - kmsg dump-specific header to flush to storage
+ *
+ * @magic: magic num for kmsg dump header
+ * @time: kmsg dump trigger time
+ * @compressed: whether conpressed
+ * @counter: kmsg dump counter
+ * @reason: the kmsg dump reason (e.g. oops, panic, etc)
+ * @data: pointer to log data
+ *
+ * This is a sub-header for a kmsg dump, trailing after &psz_buffer.
+ */
+struct psz_kmsg_header {
+#define PSTORE_KMSG_HEADER_MAGIC 0x4dfc3ae5 /* Just a random number */
+ uint32_t magic;
+ struct timespec64 time;
+ bool compressed;
+ uint32_t counter;
+ enum kmsg_dump_reason reason;
+ uint8_t data[];
+};
+
+/**
+ * struct pstore_zone - single stored buffer
+ *
+ * @off: zone offset of storage
+ * @type: front-end type for this zone
+ * @name: front-end name for this zone
+ * @buffer: pointer to data buffer managed by this zone
+ * @oldbuf: pointer to old data buffer
+ * @buffer_size: bytes in @buffer->data
+ * @should_recover: whether this zone should recover from storage
+ * @dirty: whether the data in @buffer dirty
+ *
+ * zone structure in memory.
+ */
+struct pstore_zone {
+ loff_t off;
+ const char *name;
+ enum pstore_type_id type;
+
+ struct psz_buffer *buffer;
+ struct psz_buffer *oldbuf;
+ size_t buffer_size;
+ bool should_recover;
+ atomic_t dirty;
+};
+
+/**
+ * struct psz_context - all about running state of pstore/zone
+ *
+ * @kpszs: kmsg dump storage zones
+ * @ppsz: pmsg storage zone
+ * @cpsz: console storage zone
+ * @fpszs: ftrace storage zones
+ * @kmsg_max_cnt: max count of @kpszs
+ * @kmsg_read_cnt: counter of total read kmsg dumps
+ * @kmsg_write_cnt: counter of total kmsg dump writes
+ * @pmsg_read_cnt: counter of total read pmsg zone
+ * @console_read_cnt: counter of total read console zone
+ * @ftrace_max_cnt: max count of @fpszs
+ * @ftrace_read_cnt: counter of max read ftrace zone
+ * @oops_counter: counter of oops dumps
+ * @panic_counter: counter of panic dumps
+ * @recovered: whether finished recovering data from storage
+ * @on_panic: whether panic is happening
+ * @pstore_zone_info_lock: lock to @pstore_zone_info
+ * @pstore_zone_info: information from backend
+ * @pstore: structure for pstore
+ */
+struct psz_context {
+ struct pstore_zone **kpszs;
+ struct pstore_zone *ppsz;
+ struct pstore_zone *cpsz;
+ struct pstore_zone **fpszs;
+ unsigned int kmsg_max_cnt;
+ unsigned int kmsg_read_cnt;
+ unsigned int kmsg_write_cnt;
+ unsigned int pmsg_read_cnt;
+ unsigned int console_read_cnt;
+ unsigned int ftrace_max_cnt;
+ unsigned int ftrace_read_cnt;
+ /*
+ * These counters should be calculated during recovery.
+ * It records the oops/panic times after crashes rather than boots.
+ */
+ unsigned int oops_counter;
+ unsigned int panic_counter;
+ atomic_t recovered;
+ atomic_t on_panic;
+
+ /*
+ * pstore_zone_info_lock protects this entire structure during calls
+ * to register_pstore_zone()/unregister_pstore_zone().
+ */
+ struct mutex pstore_zone_info_lock;
+ struct pstore_zone_info *pstore_zone_info;
+ struct pstore_info pstore;
+};
+static struct psz_context pstore_zone_cxt;
+
+static void psz_flush_all_dirty_zones(struct work_struct *);
+static DECLARE_DELAYED_WORK(psz_cleaner, psz_flush_all_dirty_zones);
+
+/**
+ * enum psz_flush_mode - flush mode for psz_zone_write()
+ *
+ * @FLUSH_NONE: do not flush to storage but update data on memory
+ * @FLUSH_PART: just flush part of data including meta data to storage
+ * @FLUSH_META: just flush meta data of zone to storage
+ * @FLUSH_ALL: flush all of zone
+ */
+enum psz_flush_mode {
+ FLUSH_NONE = 0,
+ FLUSH_PART,
+ FLUSH_META,
+ FLUSH_ALL,
+};
+
+static inline int buffer_datalen(struct pstore_zone *zone)
+{
+ return atomic_read(&zone->buffer->datalen);
+}
+
+static inline int buffer_start(struct pstore_zone *zone)
+{
+ return atomic_read(&zone->buffer->start);
+}
+
+static inline bool is_on_panic(void)
+{
+ return atomic_read(&pstore_zone_cxt.on_panic);
+}
+
+static ssize_t psz_zone_read_buffer(struct pstore_zone *zone, char *buf,
+ size_t len, unsigned long off)
+{
+ if (!buf || !zone || !zone->buffer)
+ return -EINVAL;
+ if (off > zone->buffer_size)
+ return -EINVAL;
+ len = min_t(size_t, len, zone->buffer_size - off);
+ memcpy(buf, zone->buffer->data + off, len);
+ return len;
+}
+
+static int psz_zone_read_oldbuf(struct pstore_zone *zone, char *buf,
+ size_t len, unsigned long off)
+{
+ if (!buf || !zone || !zone->oldbuf)
+ return -EINVAL;
+ if (off > zone->buffer_size)
+ return -EINVAL;
+ len = min_t(size_t, len, zone->buffer_size - off);
+ memcpy(buf, zone->oldbuf->data + off, len);
+ return 0;
+}
+
+static int psz_zone_write(struct pstore_zone *zone,
+ enum psz_flush_mode flush_mode, const char *buf,
+ size_t len, unsigned long off)
+{
+ struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info;
+ ssize_t wcnt = 0;
+ ssize_t (*writeop)(const char *buf, size_t bytes, loff_t pos);
+ size_t wlen;
+
+ if (off > zone->buffer_size)
+ return -EINVAL;
+
+ wlen = min_t(size_t, len, zone->buffer_size - off);
+ if (buf && wlen) {
+ memcpy(zone->buffer->data + off, buf, wlen);
+ atomic_set(&zone->buffer->datalen, wlen + off);
+ }
+
+ /* avoid to damage old records */
+ if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered))
+ goto dirty;
+
+ writeop = is_on_panic() ? info->panic_write : info->write;
+ if (!writeop)
+ goto dirty;
+
+ switch (flush_mode) {
+ case FLUSH_NONE:
+ if (unlikely(buf && wlen))
+ goto dirty;
+ return 0;
+ case FLUSH_PART:
+ wcnt = writeop((const char *)zone->buffer->data + off, wlen,
+ zone->off + sizeof(*zone->buffer) + off);
+ if (wcnt != wlen)
+ goto dirty;
+ fallthrough;
+ case FLUSH_META:
+ wlen = sizeof(struct psz_buffer);
+ wcnt = writeop((const char *)zone->buffer, wlen, zone->off);
+ if (wcnt != wlen)
+ goto dirty;
+ break;
+ case FLUSH_ALL:
+ wlen = zone->buffer_size + sizeof(*zone->buffer);
+ wcnt = writeop((const char *)zone->buffer, wlen, zone->off);
+ if (wcnt != wlen)
+ goto dirty;
+ break;
+ }
+
+ return 0;
+dirty:
+ /* no need to mark dirty if going to try next zone */
+ if (wcnt == -ENOMSG)
+ return -ENOMSG;
+ atomic_set(&zone->dirty, true);
+ /* flush dirty zones nicely */
+ if (wcnt == -EBUSY && !is_on_panic())
+ schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(500));
+ return -EBUSY;
+}
+
+static int psz_flush_dirty_zone(struct pstore_zone *zone)
+{
+ int ret;
+
+ if (unlikely(!zone))
+ return -EINVAL;
+
+ if (unlikely(!atomic_read(&pstore_zone_cxt.recovered)))
+ return -EBUSY;
+
+ if (!atomic_xchg(&zone->dirty, false))
+ return 0;
+
+ ret = psz_zone_write(zone, FLUSH_ALL, NULL, 0, 0);
+ if (ret)
+ atomic_set(&zone->dirty, true);
+ return ret;
+}
+
+static int psz_flush_dirty_zones(struct pstore_zone **zones, unsigned int cnt)
+{
+ int i, ret;
+ struct pstore_zone *zone;
+
+ if (!zones)
+ return -EINVAL;
+
+ for (i = 0; i < cnt; i++) {
+ zone = zones[i];
+ if (!zone)
+ return -EINVAL;
+ ret = psz_flush_dirty_zone(zone);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new)
+{
+ const char *data = (const char *)old->buffer->data;
+ int ret;
+
+ ret = psz_zone_write(new, FLUSH_ALL, data, buffer_datalen(old), 0);
+ if (ret) {
+ atomic_set(&new->buffer->datalen, 0);
+ atomic_set(&new->dirty, false);
+ return ret;
+ }
+ atomic_set(&old->buffer->datalen, 0);
+ return 0;
+}
+
+static void psz_flush_all_dirty_zones(struct work_struct *work)
+{
+ struct psz_context *cxt = &pstore_zone_cxt;
+ int ret = 0;
+
+ if (cxt->ppsz)
+ ret |= psz_flush_dirty_zone(cxt->ppsz);
+ if (cxt->cpsz)
+ ret |= psz_flush_dirty_zone(cxt->cpsz);
+ if (cxt->kpszs)
+ ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt);
+ if (cxt->fpszs)
+ ret |= psz_flush_dirty_zones(cxt->fpszs, cxt->ftrace_max_cnt);
+ if (ret && cxt->pstore_zone_info)
+ schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000));
+}
+
+static int psz_kmsg_recover_data(struct psz_context *cxt)
+{
+ struct pstore_zone_info *info = cxt->pstore_zone_info;
+ struct pstore_zone *zone = NULL;
+ struct psz_buffer *buf;
+ unsigned long i;
+ ssize_t rcnt;
+
+ if (!info->read)
+ return -EINVAL;
+
+ for (i = 0; i < cxt->kmsg_max_cnt; i++) {
+ zone = cxt->kpszs[i];
+ if (unlikely(!zone))
+ return -EINVAL;
+ if (atomic_read(&zone->dirty)) {
+ unsigned int wcnt = cxt->kmsg_write_cnt;
+ struct pstore_zone *new = cxt->kpszs[wcnt];
+ int ret;
+
+ ret = psz_move_zone(zone, new);
+ if (ret) {
+ pr_err("move zone from %lu to %d failed\n",
+ i, wcnt);
+ return ret;
+ }
+ cxt->kmsg_write_cnt = (wcnt + 1) % cxt->kmsg_max_cnt;
+ }
+ if (!zone->should_recover)
+ continue;
+ buf = zone->buffer;
+ rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf),
+ zone->off);
+ if (rcnt != zone->buffer_size + sizeof(*buf))
+ return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ }
+ return 0;
+}
+
+static int psz_kmsg_recover_meta(struct psz_context *cxt)
+{
+ struct pstore_zone_info *info = cxt->pstore_zone_info;
+ struct pstore_zone *zone;
+ size_t rcnt, len;
+ struct psz_buffer *buf;
+ struct psz_kmsg_header *hdr;
+ struct timespec64 time = { };
+ unsigned long i;
+ /*
+ * Recover may on panic, we can't allocate any memory by kmalloc.
+ * So, we use local array instead.
+ */
+ char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0};
+
+ if (!info->read)
+ return -EINVAL;
+
+ len = sizeof(*buf) + sizeof(*hdr);
+ buf = (struct psz_buffer *)buffer_header;
+ for (i = 0; i < cxt->kmsg_max_cnt; i++) {
+ zone = cxt->kpszs[i];
+ if (unlikely(!zone))
+ return -EINVAL;
+
+ rcnt = info->read((char *)buf, len, zone->off);
+ if (rcnt == -ENOMSG) {
+ pr_debug("%s with id %lu may be broken, skip\n",
+ zone->name, i);
+ continue;
+ } else if (rcnt != len) {
+ pr_err("read %s with id %lu failed\n", zone->name, i);
+ return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ }
+
+ if (buf->sig != zone->buffer->sig) {
+ pr_debug("no valid data in kmsg dump zone %lu\n", i);
+ continue;
+ }
+
+ if (zone->buffer_size < atomic_read(&buf->datalen)) {
+ pr_info("found overtop zone: %s: id %lu, off %lld, size %zu\n",
+ zone->name, i, zone->off,
+ zone->buffer_size);
+ continue;
+ }
+
+ hdr = (struct psz_kmsg_header *)buf->data;
+ if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) {
+ pr_info("found invalid zone: %s: id %lu, off %lld, size %zu\n",
+ zone->name, i, zone->off,
+ zone->buffer_size);
+ continue;
+ }
+
+ /*
+ * we get the newest zone, and the next one must be the oldest
+ * or unused zone, because we do write one by one like a circle.
+ */
+ if (hdr->time.tv_sec >= time.tv_sec) {
+ time.tv_sec = hdr->time.tv_sec;
+ cxt->kmsg_write_cnt = (i + 1) % cxt->kmsg_max_cnt;
+ }
+
+ if (hdr->reason == KMSG_DUMP_OOPS)
+ cxt->oops_counter =
+ max(cxt->oops_counter, hdr->counter);
+ else if (hdr->reason == KMSG_DUMP_PANIC)
+ cxt->panic_counter =
+ max(cxt->panic_counter, hdr->counter);
+
+ if (!atomic_read(&buf->datalen)) {
+ pr_debug("found erased zone: %s: id %lu, off %lld, size %zu, datalen %d\n",
+ zone->name, i, zone->off,
+ zone->buffer_size,
+ atomic_read(&buf->datalen));
+ continue;
+ }
+
+ if (!is_on_panic())
+ zone->should_recover = true;
+ pr_debug("found nice zone: %s: id %lu, off %lld, size %zu, datalen %d\n",
+ zone->name, i, zone->off,
+ zone->buffer_size, atomic_read(&buf->datalen));
+ }
+
+ return 0;
+}
+
+static int psz_kmsg_recover(struct psz_context *cxt)
+{
+ int ret;
+
+ if (!cxt->kpszs)
+ return 0;
+
+ ret = psz_kmsg_recover_meta(cxt);
+ if (ret)
+ goto recover_fail;
+
+ ret = psz_kmsg_recover_data(cxt);
+ if (ret)
+ goto recover_fail;
+
+ return 0;
+recover_fail:
+ pr_debug("psz_recover_kmsg failed\n");
+ return ret;
+}
+
+static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone)
+{
+ struct pstore_zone_info *info = cxt->pstore_zone_info;
+ struct psz_buffer *oldbuf, tmpbuf;
+ int ret = 0;
+ char *buf;
+ ssize_t rcnt, len, start, off;
+
+ if (!zone || zone->oldbuf)
+ return 0;
+
+ if (is_on_panic()) {
+ /* save data as much as possible */
+ psz_flush_dirty_zone(zone);
+ return 0;
+ }
+
+ if (unlikely(!info->read))
+ return -EINVAL;
+
+ len = sizeof(struct psz_buffer);
+ rcnt = info->read((char *)&tmpbuf, len, zone->off);
+ if (rcnt != len) {
+ pr_debug("read zone %s failed\n", zone->name);
+ return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ }
+
+ if (tmpbuf.sig != zone->buffer->sig) {
+ pr_debug("no valid data in zone %s\n", zone->name);
+ return 0;
+ }
+
+ if (zone->buffer_size < atomic_read(&tmpbuf.datalen) ||
+ zone->buffer_size < atomic_read(&tmpbuf.start)) {
+ pr_info("found overtop zone: %s: off %lld, size %zu\n",
+ zone->name, zone->off, zone->buffer_size);
+ /* just keep going */
+ return 0;
+ }
+
+ if (!atomic_read(&tmpbuf.datalen)) {
+ pr_debug("found erased zone: %s: off %lld, size %zu, datalen %d\n",
+ zone->name, zone->off, zone->buffer_size,
+ atomic_read(&tmpbuf.datalen));
+ return 0;
+ }
+
+ pr_debug("found nice zone: %s: off %lld, size %zu, datalen %d\n",
+ zone->name, zone->off, zone->buffer_size,
+ atomic_read(&tmpbuf.datalen));
+
+ len = atomic_read(&tmpbuf.datalen) + sizeof(*oldbuf);
+ oldbuf = kzalloc(len, GFP_KERNEL);
+ if (!oldbuf)
+ return -ENOMEM;
+
+ memcpy(oldbuf, &tmpbuf, sizeof(*oldbuf));
+ buf = (char *)oldbuf + sizeof(*oldbuf);
+ len = atomic_read(&oldbuf->datalen);
+ start = atomic_read(&oldbuf->start);
+ off = zone->off + sizeof(*oldbuf);
+
+ /* get part of data */
+ rcnt = info->read(buf, len - start, off + start);
+ if (rcnt != len - start) {
+ pr_err("read zone %s failed\n", zone->name);
+ ret = (int)rcnt < 0 ? (int)rcnt : -EIO;
+ goto free_oldbuf;
+ }
+
+ /* get the rest of data */
+ rcnt = info->read(buf + len - start, start, off);
+ if (rcnt != start) {
+ pr_err("read zone %s failed\n", zone->name);
+ ret = (int)rcnt < 0 ? (int)rcnt : -EIO;
+ goto free_oldbuf;
+ }
+
+ zone->oldbuf = oldbuf;
+ psz_flush_dirty_zone(zone);
+ return 0;
+
+free_oldbuf:
+ kfree(oldbuf);
+ return ret;
+}
+
+static int psz_recover_zones(struct psz_context *cxt,
+ struct pstore_zone **zones, unsigned int cnt)
+{
+ int ret;
+ unsigned int i;
+ struct pstore_zone *zone;
+
+ if (!zones)
+ return 0;
+
+ for (i = 0; i < cnt; i++) {
+ zone = zones[i];
+ if (unlikely(!zone))
+ continue;
+ ret = psz_recover_zone(cxt, zone);
+ if (ret)
+ goto recover_fail;
+ }
+
+ return 0;
+recover_fail:
+ pr_debug("recover %s[%u] failed\n", zone->name, i);
+ return ret;
+}
+
+/**
+ * psz_recovery() - recover data from storage
+ * @cxt: the context of pstore/zone
+ *
+ * recovery means reading data back from storage after rebooting
+ *
+ * Return: 0 on success, others on failure.
+ */
+static inline int psz_recovery(struct psz_context *cxt)
+{
+ int ret;
+
+ if (atomic_read(&cxt->recovered))
+ return 0;
+
+ ret = psz_kmsg_recover(cxt);
+ if (ret)
+ goto out;
+
+ ret = psz_recover_zone(cxt, cxt->ppsz);
+ if (ret)
+ goto out;
+
+ ret = psz_recover_zone(cxt, cxt->cpsz);
+ if (ret)
+ goto out;
+
+ ret = psz_recover_zones(cxt, cxt->fpszs, cxt->ftrace_max_cnt);
+
+out:
+ if (unlikely(ret))
+ pr_err("recover failed\n");
+ else {
+ pr_debug("recover end!\n");
+ atomic_set(&cxt->recovered, 1);
+ }
+ return ret;
+}
+
+static int psz_pstore_open(struct pstore_info *psi)
+{
+ struct psz_context *cxt = psi->data;
+
+ cxt->kmsg_read_cnt = 0;
+ cxt->pmsg_read_cnt = 0;
+ cxt->console_read_cnt = 0;
+ cxt->ftrace_read_cnt = 0;
+ return 0;
+}
+
+static inline bool psz_old_ok(struct pstore_zone *zone)
+{
+ if (zone && zone->oldbuf && atomic_read(&zone->oldbuf->datalen))
+ return true;
+ return false;
+}
+
+static inline bool psz_ok(struct pstore_zone *zone)
+{
+ if (zone && zone->buffer && buffer_datalen(zone))
+ return true;
+ return false;
+}
+
+static inline int psz_kmsg_erase(struct psz_context *cxt,
+ struct pstore_zone *zone, struct pstore_record *record)
+{
+ struct psz_buffer *buffer = zone->buffer;
+ struct psz_kmsg_header *hdr =
+ (struct psz_kmsg_header *)buffer->data;
+ size_t size;
+
+ if (unlikely(!psz_ok(zone)))
+ return 0;
+
+ /* this zone is already updated, no need to erase */
+ if (record->count != hdr->counter)
+ return 0;
+
+ size = buffer_datalen(zone) + sizeof(*zone->buffer);
+ atomic_set(&zone->buffer->datalen, 0);
+ if (cxt->pstore_zone_info->erase)
+ return cxt->pstore_zone_info->erase(size, zone->off);
+ else
+ return psz_zone_write(zone, FLUSH_META, NULL, 0, 0);
+}
+
+static inline int psz_record_erase(struct psz_context *cxt,
+ struct pstore_zone *zone)
+{
+ if (unlikely(!psz_old_ok(zone)))
+ return 0;
+
+ kfree(zone->oldbuf);
+ zone->oldbuf = NULL;
+ /*
+ * if there are new data in zone buffer, that means the old data
+ * are already invalid. It is no need to flush 0 (erase) to
+ * block device.
+ */
+ if (!buffer_datalen(zone))
+ return psz_zone_write(zone, FLUSH_META, NULL, 0, 0);
+ psz_flush_dirty_zone(zone);
+ return 0;
+}
+
+static int psz_pstore_erase(struct pstore_record *record)
+{
+ struct psz_context *cxt = record->psi->data;
+
+ switch (record->type) {
+ case PSTORE_TYPE_DMESG:
+ if (record->id >= cxt->kmsg_max_cnt)
+ return -EINVAL;
+ return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record);
+ case PSTORE_TYPE_PMSG:
+ return psz_record_erase(cxt, cxt->ppsz);
+ case PSTORE_TYPE_CONSOLE:
+ return psz_record_erase(cxt, cxt->cpsz);
+ case PSTORE_TYPE_FTRACE:
+ if (record->id >= cxt->ftrace_max_cnt)
+ return -EINVAL;
+ return psz_record_erase(cxt, cxt->fpszs[record->id]);
+ default: return -EINVAL;
+ }
+}
+
+static void psz_write_kmsg_hdr(struct pstore_zone *zone,
+ struct pstore_record *record)
+{
+ struct psz_context *cxt = record->psi->data;
+ struct psz_buffer *buffer = zone->buffer;
+ struct psz_kmsg_header *hdr =
+ (struct psz_kmsg_header *)buffer->data;
+
+ hdr->magic = PSTORE_KMSG_HEADER_MAGIC;
+ hdr->compressed = record->compressed;
+ hdr->time.tv_sec = record->time.tv_sec;
+ hdr->time.tv_nsec = record->time.tv_nsec;
+ hdr->reason = record->reason;
+ if (hdr->reason == KMSG_DUMP_OOPS)
+ hdr->counter = ++cxt->oops_counter;
+ else if (hdr->reason == KMSG_DUMP_PANIC)
+ hdr->counter = ++cxt->panic_counter;
+ else
+ hdr->counter = 0;
+}
+
+/*
+ * In case zone is broken, which may occur to MTD device, we try each zones,
+ * start at cxt->kmsg_write_cnt.
+ */
+static inline int notrace psz_kmsg_write_record(struct psz_context *cxt,
+ struct pstore_record *record)
+{
+ size_t size, hlen;
+ struct pstore_zone *zone;
+ unsigned int i;
+
+ for (i = 0; i < cxt->kmsg_max_cnt; i++) {
+ unsigned int zonenum, len;
+ int ret;
+
+ zonenum = (cxt->kmsg_write_cnt + i) % cxt->kmsg_max_cnt;
+ zone = cxt->kpszs[zonenum];
+ if (unlikely(!zone))
+ return -ENOSPC;
+
+ /* avoid destroying old data, allocate a new one */
+ len = zone->buffer_size + sizeof(*zone->buffer);
+ zone->oldbuf = zone->buffer;
+ zone->buffer = kzalloc(len, GFP_KERNEL);
+ if (!zone->buffer) {
+ zone->buffer = zone->oldbuf;
+ return -ENOMEM;
+ }
+ zone->buffer->sig = zone->oldbuf->sig;
+
+ pr_debug("write %s to zone id %d\n", zone->name, zonenum);
+ psz_write_kmsg_hdr(zone, record);
+ hlen = sizeof(struct psz_kmsg_header);
+ size = min_t(size_t, record->size, zone->buffer_size - hlen);
+ ret = psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen);
+ if (likely(!ret || ret != -ENOMSG)) {
+ cxt->kmsg_write_cnt = zonenum + 1;
+ cxt->kmsg_write_cnt %= cxt->kmsg_max_cnt;
+ /* no need to try next zone, free last zone buffer */
+ kfree(zone->oldbuf);
+ zone->oldbuf = NULL;
+ return ret;
+ }
+
+ pr_debug("zone %u may be broken, try next dmesg zone\n",
+ zonenum);
+ kfree(zone->buffer);
+ zone->buffer = zone->oldbuf;
+ zone->oldbuf = NULL;
+ }
+
+ return -EBUSY;
+}
+
+static int notrace psz_kmsg_write(struct psz_context *cxt,
+ struct pstore_record *record)
+{
+ int ret;
+
+ /*
+ * Explicitly only take the first part of any new crash.
+ * If our buffer is larger than kmsg_bytes, this can never happen,
+ * and if our buffer is smaller than kmsg_bytes, we don't want the
+ * report split across multiple records.
+ */
+ if (record->part != 1)
+ return -ENOSPC;
+
+ if (!cxt->kpszs)
+ return -ENOSPC;
+
+ ret = psz_kmsg_write_record(cxt, record);
+ if (!ret && is_on_panic()) {
+ /* ensure all data are flushed to storage when panic */
+ pr_debug("try to flush other dirty zones\n");
+ psz_flush_all_dirty_zones(NULL);
+ }
+
+ /* always return 0 as we had handled it on buffer */
+ return 0;
+}
+
+static int notrace psz_record_write(struct pstore_zone *zone,
+ struct pstore_record *record)
+{
+ size_t start, rem;
+ bool is_full_data = false;
+ char *buf;
+ int cnt;
+
+ if (!zone || !record)
+ return -ENOSPC;
+
+ if (atomic_read(&zone->buffer->datalen) >= zone->buffer_size)
+ is_full_data = true;
+
+ cnt = record->size;
+ buf = record->buf;
+ if (unlikely(cnt > zone->buffer_size)) {
+ buf += cnt - zone->buffer_size;
+ cnt = zone->buffer_size;
+ }
+
+ start = buffer_start(zone);
+ rem = zone->buffer_size - start;
+ if (unlikely(rem < cnt)) {
+ psz_zone_write(zone, FLUSH_PART, buf, rem, start);
+ buf += rem;
+ cnt -= rem;
+ start = 0;
+ is_full_data = true;
+ }
+
+ atomic_set(&zone->buffer->start, cnt + start);
+ psz_zone_write(zone, FLUSH_PART, buf, cnt, start);
+
+ /**
+ * psz_zone_write will set datalen as start + cnt.
+ * It work if actual data length lesser than buffer size.
+ * If data length greater than buffer size, pmsg will rewrite to
+ * beginning of zone, which make buffer->datalen wrongly.
+ * So we should reset datalen as buffer size once actual data length
+ * greater than buffer size.
+ */
+ if (is_full_data) {
+ atomic_set(&zone->buffer->datalen, zone->buffer_size);
+ psz_zone_write(zone, FLUSH_META, NULL, 0, 0);
+ }
+ return 0;
+}
+
+static int notrace psz_pstore_write(struct pstore_record *record)
+{
+ struct psz_context *cxt = record->psi->data;
+
+ if (record->type == PSTORE_TYPE_DMESG &&
+ record->reason == KMSG_DUMP_PANIC)
+ atomic_set(&cxt->on_panic, 1);
+
+ /*
+ * if on panic, do not write except panic records
+ * Fix case that panic_write prints log which wakes up console backend.
+ */
+ if (is_on_panic() && record->type != PSTORE_TYPE_DMESG)
+ return -EBUSY;
+
+ switch (record->type) {
+ case PSTORE_TYPE_DMESG:
+ return psz_kmsg_write(cxt, record);
+ case PSTORE_TYPE_CONSOLE:
+ return psz_record_write(cxt->cpsz, record);
+ case PSTORE_TYPE_PMSG:
+ return psz_record_write(cxt->ppsz, record);
+ case PSTORE_TYPE_FTRACE: {
+ int zonenum = smp_processor_id();
+
+ if (!cxt->fpszs)
+ return -ENOSPC;
+ return psz_record_write(cxt->fpszs[zonenum], record);
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt)
+{
+ struct pstore_zone *zone = NULL;
+
+ while (cxt->kmsg_read_cnt < cxt->kmsg_max_cnt) {
+ zone = cxt->kpszs[cxt->kmsg_read_cnt++];
+ if (psz_ok(zone))
+ return zone;
+ }
+
+ if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt)
+ /*
+ * No need psz_old_ok(). Let psz_ftrace_read() do so for
+ * combination. psz_ftrace_read() should traverse over
+ * all zones in case of some zone without data.
+ */
+ return cxt->fpszs[cxt->ftrace_read_cnt++];
+
+ if (cxt->pmsg_read_cnt == 0) {
+ cxt->pmsg_read_cnt++;
+ zone = cxt->ppsz;
+ if (psz_old_ok(zone))
+ return zone;
+ }
+
+ if (cxt->console_read_cnt == 0) {
+ cxt->console_read_cnt++;
+ zone = cxt->cpsz;
+ if (psz_old_ok(zone))
+ return zone;
+ }
+
+ return NULL;
+}
+
+static int psz_kmsg_read_hdr(struct pstore_zone *zone,
+ struct pstore_record *record)
+{
+ struct psz_buffer *buffer = zone->buffer;
+ struct psz_kmsg_header *hdr =
+ (struct psz_kmsg_header *)buffer->data;
+
+ if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC)
+ return -EINVAL;
+ record->compressed = hdr->compressed;
+ record->time.tv_sec = hdr->time.tv_sec;
+ record->time.tv_nsec = hdr->time.tv_nsec;
+ record->reason = hdr->reason;
+ record->count = hdr->counter;
+ return 0;
+}
+
+static ssize_t psz_kmsg_read(struct pstore_zone *zone,
+ struct pstore_record *record)
+{
+ ssize_t size, hlen = 0;
+
+ size = buffer_datalen(zone);
+ /* Clear and skip this kmsg dump record if it has no valid header */
+ if (psz_kmsg_read_hdr(zone, record)) {
+ atomic_set(&zone->buffer->datalen, 0);
+ atomic_set(&zone->dirty, 0);
+ return -ENOMSG;
+ }
+ size -= sizeof(struct psz_kmsg_header);
+
+ if (!record->compressed) {
+ char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n",
+ kmsg_dump_reason_str(record->reason),
+ record->count);
+ hlen = strlen(buf);
+ record->buf = krealloc(buf, hlen + size, GFP_KERNEL);
+ if (!record->buf) {
+ kfree(buf);
+ return -ENOMEM;
+ }
+ } else {
+ record->buf = kmalloc(size, GFP_KERNEL);
+ if (!record->buf)
+ return -ENOMEM;
+ }
+
+ size = psz_zone_read_buffer(zone, record->buf + hlen, size,
+ sizeof(struct psz_kmsg_header));
+ if (unlikely(size < 0)) {
+ kfree(record->buf);
+ return -ENOMSG;
+ }
+
+ return size + hlen;
+}
+
+/* try to combine all ftrace zones */
+static ssize_t psz_ftrace_read(struct pstore_zone *zone,
+ struct pstore_record *record)
+{
+ struct psz_context *cxt;
+ struct psz_buffer *buf;
+ int ret;
+
+ if (!zone || !record)
+ return -ENOSPC;
+
+ if (!psz_old_ok(zone))
+ goto out;
+
+ buf = (struct psz_buffer *)zone->oldbuf;
+ if (!buf)
+ return -ENOMSG;
+
+ ret = pstore_ftrace_combine_log(&record->buf, &record->size,
+ (char *)buf->data, atomic_read(&buf->datalen));
+ if (unlikely(ret))
+ return ret;
+
+out:
+ cxt = record->psi->data;
+ if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt)
+ /* then, read next ftrace zone */
+ return -ENOMSG;
+ record->id = 0;
+ return record->size ? record->size : -ENOMSG;
+}
+
+static ssize_t psz_record_read(struct pstore_zone *zone,
+ struct pstore_record *record)
+{
+ size_t len;
+ struct psz_buffer *buf;
+
+ if (!zone || !record)
+ return -ENOSPC;
+
+ buf = (struct psz_buffer *)zone->oldbuf;
+ if (!buf)
+ return -ENOMSG;
+
+ len = atomic_read(&buf->datalen);
+ record->buf = kmalloc(len, GFP_KERNEL);
+ if (!record->buf)
+ return -ENOMEM;
+
+ if (unlikely(psz_zone_read_oldbuf(zone, record->buf, len, 0))) {
+ kfree(record->buf);
+ return -ENOMSG;
+ }
+
+ return len;
+}
+
+static ssize_t psz_pstore_read(struct pstore_record *record)
+{
+ struct psz_context *cxt = record->psi->data;
+ ssize_t (*readop)(struct pstore_zone *zone,
+ struct pstore_record *record);
+ struct pstore_zone *zone;
+ ssize_t ret;
+
+ /* before read, we must recover from storage */
+ ret = psz_recovery(cxt);
+ if (ret)
+ return ret;
+
+next_zone:
+ zone = psz_read_next_zone(cxt);
+ if (!zone)
+ return 0;
+
+ record->type = zone->type;
+ switch (record->type) {
+ case PSTORE_TYPE_DMESG:
+ readop = psz_kmsg_read;
+ record->id = cxt->kmsg_read_cnt - 1;
+ break;
+ case PSTORE_TYPE_FTRACE:
+ readop = psz_ftrace_read;
+ break;
+ case PSTORE_TYPE_CONSOLE:
+ fallthrough;
+ case PSTORE_TYPE_PMSG:
+ readop = psz_record_read;
+ break;
+ default:
+ goto next_zone;
+ }
+
+ ret = readop(zone, record);
+ if (ret == -ENOMSG)
+ goto next_zone;
+ return ret;
+}
+
+static struct psz_context pstore_zone_cxt = {
+ .pstore_zone_info_lock =
+ __MUTEX_INITIALIZER(pstore_zone_cxt.pstore_zone_info_lock),
+ .recovered = ATOMIC_INIT(0),
+ .on_panic = ATOMIC_INIT(0),
+ .pstore = {
+ .owner = THIS_MODULE,
+ .open = psz_pstore_open,
+ .read = psz_pstore_read,
+ .write = psz_pstore_write,
+ .erase = psz_pstore_erase,
+ },
+};
+
+static void psz_free_zone(struct pstore_zone **pszone)
+{
+ struct pstore_zone *zone = *pszone;
+
+ if (!zone)
+ return;
+
+ kfree(zone->buffer);
+ kfree(zone);
+ *pszone = NULL;
+}
+
+static void psz_free_zones(struct pstore_zone ***pszones, unsigned int *cnt)
+{
+ struct pstore_zone **zones = *pszones;
+
+ if (!zones)
+ return;
+
+ while (*cnt > 0) {
+ (*cnt)--;
+ psz_free_zone(&(zones[*cnt]));
+ }
+ kfree(zones);
+ *pszones = NULL;
+}
+
+static void psz_free_all_zones(struct psz_context *cxt)
+{
+ if (cxt->kpszs)
+ psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt);
+ if (cxt->ppsz)
+ psz_free_zone(&cxt->ppsz);
+ if (cxt->cpsz)
+ psz_free_zone(&cxt->cpsz);
+ if (cxt->fpszs)
+ psz_free_zones(&cxt->fpszs, &cxt->ftrace_max_cnt);
+}
+
+static struct pstore_zone *psz_init_zone(enum pstore_type_id type,
+ loff_t *off, size_t size)
+{
+ struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info;
+ struct pstore_zone *zone;
+ const char *name = pstore_type_to_name(type);
+
+ if (!size)
+ return NULL;
+
+ if (*off + size > info->total_size) {
+ pr_err("no room for %s (0x%zx@0x%llx over 0x%lx)\n",
+ name, size, *off, info->total_size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ zone = kzalloc(sizeof(struct pstore_zone), GFP_KERNEL);
+ if (!zone)
+ return ERR_PTR(-ENOMEM);
+
+ zone->buffer = kmalloc(size, GFP_KERNEL);
+ if (!zone->buffer) {
+ kfree(zone);
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(zone->buffer, 0xFF, size);
+ zone->off = *off;
+ zone->name = name;
+ zone->type = type;
+ zone->buffer_size = size - sizeof(struct psz_buffer);
+ zone->buffer->sig = type ^ PSZ_SIG;
+ zone->oldbuf = NULL;
+ atomic_set(&zone->dirty, 0);
+ atomic_set(&zone->buffer->datalen, 0);
+ atomic_set(&zone->buffer->start, 0);
+
+ *off += size;
+
+ pr_debug("pszone %s: off 0x%llx, %zu header, %zu data\n", zone->name,
+ zone->off, sizeof(*zone->buffer), zone->buffer_size);
+ return zone;
+}
+
+static struct pstore_zone **psz_init_zones(enum pstore_type_id type,
+ loff_t *off, size_t total_size, ssize_t record_size,
+ unsigned int *cnt)
+{
+ struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info;
+ struct pstore_zone **zones, *zone;
+ const char *name = pstore_type_to_name(type);
+ int c, i;
+
+ *cnt = 0;
+ if (!total_size || !record_size)
+ return NULL;
+
+ if (*off + total_size > info->total_size) {
+ pr_err("no room for zones %s (0x%zx@0x%llx over 0x%lx)\n",
+ name, total_size, *off, info->total_size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ c = total_size / record_size;
+ zones = kcalloc(c, sizeof(*zones), GFP_KERNEL);
+ if (!zones) {
+ pr_err("allocate for zones %s failed\n", name);
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(zones, 0, c * sizeof(*zones));
+
+ for (i = 0; i < c; i++) {
+ zone = psz_init_zone(type, off, record_size);
+ if (!zone || IS_ERR(zone)) {
+ pr_err("initialize zones %s failed\n", name);
+ psz_free_zones(&zones, &i);
+ return (void *)zone;
+ }
+ zones[i] = zone;
+ }
+
+ *cnt = c;
+ return zones;
+}
+
+static int psz_alloc_zones(struct psz_context *cxt)
+{
+ struct pstore_zone_info *info = cxt->pstore_zone_info;
+ loff_t off = 0;
+ int err;
+ size_t off_size = 0;
+
+ off_size += info->pmsg_size;
+ cxt->ppsz = psz_init_zone(PSTORE_TYPE_PMSG, &off, info->pmsg_size);
+ if (IS_ERR(cxt->ppsz)) {
+ err = PTR_ERR(cxt->ppsz);
+ cxt->ppsz = NULL;
+ goto free_out;
+ }
+
+ off_size += info->console_size;
+ cxt->cpsz = psz_init_zone(PSTORE_TYPE_CONSOLE, &off,
+ info->console_size);
+ if (IS_ERR(cxt->cpsz)) {
+ err = PTR_ERR(cxt->cpsz);
+ cxt->cpsz = NULL;
+ goto free_out;
+ }
+
+ off_size += info->ftrace_size;
+ cxt->fpszs = psz_init_zones(PSTORE_TYPE_FTRACE, &off,
+ info->ftrace_size,
+ info->ftrace_size / nr_cpu_ids,
+ &cxt->ftrace_max_cnt);
+ if (IS_ERR(cxt->fpszs)) {
+ err = PTR_ERR(cxt->fpszs);
+ cxt->fpszs = NULL;
+ goto free_out;
+ }
+
+ cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off,
+ info->total_size - off_size,
+ info->kmsg_size, &cxt->kmsg_max_cnt);
+ if (IS_ERR(cxt->kpszs)) {
+ err = PTR_ERR(cxt->kpszs);
+ cxt->kpszs = NULL;
+ goto free_out;
+ }
+
+ return 0;
+free_out:
+ psz_free_all_zones(cxt);
+ return err;
+}
+
+/**
+ * register_pstore_zone() - register to pstore/zone
+ *
+ * @info: back-end driver information. See &struct pstore_zone_info.
+ *
+ * Only one back-end at one time.
+ *
+ * Return: 0 on success, others on failure.
+ */
+int register_pstore_zone(struct pstore_zone_info *info)
+{
+ int err = -EINVAL;
+ struct psz_context *cxt = &pstore_zone_cxt;
+
+ if (info->total_size < 4096) {
+ pr_warn("total_size must be >= 4096\n");
+ return -EINVAL;
+ }
+
+ if (!info->kmsg_size && !info->pmsg_size && !info->console_size &&
+ !info->ftrace_size) {
+ pr_warn("at least one record size must be non-zero\n");
+ return -EINVAL;
+ }
+
+ if (!info->name || !info->name[0])
+ return -EINVAL;
+
+#define check_size(name, size) { \
+ if (info->name > 0 && info->name < (size)) { \
+ pr_err(#name " must be over %d\n", (size)); \
+ return -EINVAL; \
+ } \
+ if (info->name & (size - 1)) { \
+ pr_err(#name " must be a multiple of %d\n", \
+ (size)); \
+ return -EINVAL; \
+ } \
+ }
+
+ check_size(total_size, 4096);
+ check_size(kmsg_size, SECTOR_SIZE);
+ check_size(pmsg_size, SECTOR_SIZE);
+ check_size(console_size, SECTOR_SIZE);
+ check_size(ftrace_size, SECTOR_SIZE);
+
+#undef check_size
+
+ /*
+ * the @read and @write must be applied.
+ * if no @read, pstore may mount failed.
+ * if no @write, pstore do not support to remove record file.
+ */
+ if (!info->read || !info->write) {
+ pr_err("no valid general read/write interface\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&cxt->pstore_zone_info_lock);
+ if (cxt->pstore_zone_info) {
+ pr_warn("'%s' already loaded: ignoring '%s'\n",
+ cxt->pstore_zone_info->name, info->name);
+ mutex_unlock(&cxt->pstore_zone_info_lock);
+ return -EBUSY;
+ }
+ cxt->pstore_zone_info = info;
+
+ pr_debug("register %s with properties:\n", info->name);
+ pr_debug("\ttotal size : %ld Bytes\n", info->total_size);
+ pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size);
+ pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size);
+ pr_debug("\tconsole size : %ld Bytes\n", info->console_size);
+ pr_debug("\tftrace size : %ld Bytes\n", info->ftrace_size);
+
+ err = psz_alloc_zones(cxt);
+ if (err) {
+ pr_err("alloc zones failed\n");
+ goto fail_out;
+ }
+
+ if (info->kmsg_size) {
+ cxt->pstore.bufsize = cxt->kpszs[0]->buffer_size -
+ sizeof(struct psz_kmsg_header);
+ cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
+ if (!cxt->pstore.buf) {
+ err = -ENOMEM;
+ goto fail_free;
+ }
+ }
+ cxt->pstore.data = cxt;
+
+ pr_info("registered %s as backend for", info->name);
+ cxt->pstore.max_reason = info->max_reason;
+ cxt->pstore.name = info->name;
+ if (info->kmsg_size) {
+ cxt->pstore.flags |= PSTORE_FLAGS_DMESG;
+ pr_cont(" kmsg(%s",
+ kmsg_dump_reason_str(cxt->pstore.max_reason));
+ if (cxt->pstore_zone_info->panic_write)
+ pr_cont(",panic_write");
+ pr_cont(")");
+ }
+ if (info->pmsg_size) {
+ cxt->pstore.flags |= PSTORE_FLAGS_PMSG;
+ pr_cont(" pmsg");
+ }
+ if (info->console_size) {
+ cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
+ pr_cont(" console");
+ }
+ if (info->ftrace_size) {
+ cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
+ pr_cont(" ftrace");
+ }
+ pr_cont("\n");
+
+ err = pstore_register(&cxt->pstore);
+ if (err) {
+ pr_err("registering with pstore failed\n");
+ goto fail_free;
+ }
+ mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock);
+
+ return 0;
+
+fail_free:
+ kfree(cxt->pstore.buf);
+ cxt->pstore.buf = NULL;
+ cxt->pstore.bufsize = 0;
+ psz_free_all_zones(cxt);
+fail_out:
+ pstore_zone_cxt.pstore_zone_info = NULL;
+ mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_pstore_zone);
+
+/**
+ * unregister_pstore_zone() - unregister to pstore/zone
+ *
+ * @info: back-end driver information. See struct pstore_zone_info.
+ */
+void unregister_pstore_zone(struct pstore_zone_info *info)
+{
+ struct psz_context *cxt = &pstore_zone_cxt;
+
+ mutex_lock(&cxt->pstore_zone_info_lock);
+ if (!cxt->pstore_zone_info) {
+ mutex_unlock(&cxt->pstore_zone_info_lock);
+ return;
+ }
+
+ /* Stop incoming writes from pstore. */
+ pstore_unregister(&cxt->pstore);
+
+ /* Flush any pending writes. */
+ psz_flush_all_dirty_zones(NULL);
+ flush_delayed_work(&psz_cleaner);
+
+ /* Clean up allocations. */
+ kfree(cxt->pstore.buf);
+ cxt->pstore.buf = NULL;
+ cxt->pstore.bufsize = 0;
+ cxt->pstore_zone_info = NULL;
+
+ psz_free_all_zones(cxt);
+
+ /* Clear counters and zone state. */
+ cxt->oops_counter = 0;
+ cxt->panic_counter = 0;
+ atomic_set(&cxt->recovered, 0);
+ atomic_set(&cxt->on_panic, 0);
+
+ mutex_unlock(&cxt->pstore_zone_info_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_pstore_zone);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@allwinnertech.com>");
+MODULE_AUTHOR("Kees Cook <keescook@chromium.org>");
+MODULE_DESCRIPTION("Storage Manager for pstore/blk");
diff --git a/fs/splice.c b/fs/splice.c
index 4735defc46ee..4e53efbd621d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1118,6 +1118,10 @@ long do_splice(struct file *in, loff_t __user *off_in,
loff_t offset;
long ret;
+ if (unlikely(!(in->f_mode & FMODE_READ) ||
+ !(out->f_mode & FMODE_WRITE)))
+ return -EBADF;
+
ipipe = get_pipe_info(in);
opipe = get_pipe_info(out);
@@ -1125,12 +1129,6 @@ long do_splice(struct file *in, loff_t __user *off_in,
if (off_in || off_out)
return -ESPIPE;
- if (!(in->f_mode & FMODE_READ))
- return -EBADF;
-
- if (!(out->f_mode & FMODE_WRITE))
- return -EBADF;
-
/* Splicing to self would be fun, but... */
if (ipipe == opipe)
return -EINVAL;
@@ -1153,9 +1151,6 @@ long do_splice(struct file *in, loff_t __user *off_in,
offset = out->f_pos;
}
- if (unlikely(!(out->f_mode & FMODE_WRITE)))
- return -EBADF;
-
if (unlikely(out->f_flags & O_APPEND))
return -EINVAL;
@@ -1440,15 +1435,11 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
error = -EBADF;
in = fdget(fd_in);
if (in.file) {
- if (in.file->f_mode & FMODE_READ) {
- out = fdget(fd_out);
- if (out.file) {
- if (out.file->f_mode & FMODE_WRITE)
- error = do_splice(in.file, off_in,
- out.file, off_out,
- len, flags);
- fdput(out);
- }
+ out = fdget(fd_out);
+ if (out.file) {
+ error = do_splice(in.file, off_in, out.file, off_out,
+ len, flags);
+ fdput(out);
}
fdput(in);
}
@@ -1503,7 +1494,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Check pipe occupancy without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
return 0;
ret = 0;
@@ -1770,6 +1761,10 @@ static long do_tee(struct file *in, struct file *out, size_t len,
struct pipe_inode_info *opipe = get_pipe_info(out);
int ret = -EINVAL;
+ if (unlikely(!(in->f_mode & FMODE_READ) ||
+ !(out->f_mode & FMODE_WRITE)))
+ return -EBADF;
+
/*
* Duplicate the contents of ipipe to opipe without actually
* copying the data.
@@ -1795,7 +1790,7 @@ static long do_tee(struct file *in, struct file *out, size_t len,
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
- struct fd in;
+ struct fd in, out;
int error;
if (unlikely(flags & ~SPLICE_F_ALL))
@@ -1807,14 +1802,10 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
error = -EBADF;
in = fdget(fdin);
if (in.file) {
- if (in.file->f_mode & FMODE_READ) {
- struct fd out = fdget(fdout);
- if (out.file) {
- if (out.file->f_mode & FMODE_WRITE)
- error = do_tee(in.file, out.file,
- len, flags);
- fdput(out);
- }
+ out = fdget(fdout);
+ if (out.file) {
+ error = do_tee(in.file, out.file, len, flags);
+ fdput(out);
}
fdput(in);
}
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index 2a2a2d106440..e206ebfe003c 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/percpu.h>
#include <linux/buffer_head.h>
+#include <linux/local_lock.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -20,7 +21,8 @@
*/
struct squashfs_stream {
- void *stream;
+ void *stream;
+ local_lock_t lock;
};
void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
@@ -41,6 +43,7 @@ void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
err = PTR_ERR(stream->stream);
goto out;
}
+ local_lock_init(&stream->lock);
}
kfree(comp_opts);
@@ -75,12 +78,16 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
int b, int offset, int length, struct squashfs_page_actor *output)
{
- struct squashfs_stream __percpu *percpu =
- (struct squashfs_stream __percpu *) msblk->stream;
- struct squashfs_stream *stream = get_cpu_ptr(percpu);
- int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
- offset, length, output);
- put_cpu_ptr(stream);
+ struct squashfs_stream *stream;
+ int res;
+
+ local_lock(&msblk->stream->lock);
+ stream = this_cpu_ptr(msblk->stream);
+
+ res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+ offset, length, output);
+
+ local_unlock(&msblk->stream->lock);
if (res < 0)
ERROR("%s decompression failed, data probably corrupt\n",
diff --git a/fs/super.c b/fs/super.c
index cd352530eca9..a288cd60d2ae 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1302,8 +1302,8 @@ int get_tree_bdev(struct fs_context *fc,
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- blkdev_put(bdev, mode);
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
+ blkdev_put(bdev, mode);
return -EBUSY;
}
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 8cdbd53d780c..cc5c0abfd536 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -31,15 +31,9 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node,
u8 *hash)
{
const struct ubifs_ch *ch = node;
- SHASH_DESC_ON_STACK(shash, c->hash_tfm);
- int err;
-
- shash->tfm = c->hash_tfm;
- err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash);
- if (err < 0)
- return err;
- return 0;
+ return crypto_shash_tfm_digest(c->hash_tfm, node, le32_to_cpu(ch->len),
+ hash);
}
/**
@@ -53,15 +47,7 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node,
static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash,
u8 *hmac)
{
- SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
- int err;
-
- shash->tfm = c->hmac_tfm;
-
- err = crypto_shash_digest(shash, hash, c->hash_len, hmac);
- if (err < 0)
- return err;
- return 0;
+ return crypto_shash_tfm_digest(c->hmac_tfm, hash, c->hash_len, hmac);
}
/**
@@ -79,13 +65,9 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
struct shash_desc *inhash)
{
struct ubifs_auth_node *auth = node;
- u8 *hash;
+ u8 hash[UBIFS_HASH_ARR_SZ];
int err;
- hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS);
- if (!hash)
- return -ENOMEM;
-
{
SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
@@ -94,21 +76,16 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
err = crypto_shash_final(hash_desc, hash);
if (err)
- goto out;
+ return err;
}
err = ubifs_hash_calc_hmac(c, hash, auth->hmac);
if (err)
- goto out;
+ return err;
auth->ch.node_type = UBIFS_AUTH_NODE;
ubifs_prepare_node(c, auth, ubifs_auth_node_sz(c), 0);
-
- err = 0;
-out:
- kfree(hash);
-
- return err;
+ return 0;
}
static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 743928efffc1..49fe062ce45e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1375,7 +1375,6 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time,
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_budget_req req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
- int iflags = I_DIRTY_TIME;
int err, release;
if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
@@ -1393,11 +1392,8 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time,
if (flags & S_MTIME)
inode->i_mtime = *time;
- if (!(inode->i_sb->s_flags & SB_LAZYTIME))
- iflags |= I_DIRTY_SYNC;
-
release = ui->dirty;
- __mark_inode_dirty(inode, iflags);
+ __mark_inode_dirty(inode, I_DIRTY_SYNC);
mutex_unlock(&ui->ui_mutex);
if (release)
ubifs_release_budget(c, &req);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 52a85c01397e..911d0555b9f2 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -68,12 +68,9 @@ static int mst_node_check_hash(const struct ubifs_info *c,
u8 calc[UBIFS_MAX_HASH_LEN];
const void *node = mst;
- SHASH_DESC_ON_STACK(shash, c->hash_tfm);
-
- shash->tfm = c->hash_tfm;
-
- crypto_shash_digest(shash, node + sizeof(struct ubifs_ch),
- UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc);
+ crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch),
+ UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch),
+ calc);
if (ubifs_check_hash(c, expected, calc))
return -EPERM;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index b28ac4dfb407..b69ffac7e415 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -558,7 +558,7 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
return data == 0xFFFFFFFF;
}
-/* authenticate_sleb_hash and authenticate_sleb_hmac are split out for stack usage */
+/* authenticate_sleb_hash is split out for stack usage */
static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash)
{
SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
@@ -569,15 +569,6 @@ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_h
return crypto_shash_final(hash_desc, hash);
}
-static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac)
-{
- SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm);
-
- hmac_desc->tfm = c->hmac_tfm;
-
- return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac);
-}
-
/**
* authenticate_sleb - authenticate one scan LEB
* @c: UBIFS file-system description object
@@ -601,18 +592,12 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
struct ubifs_scan_node *snod;
int n_nodes = 0;
int err;
- u8 *hash, *hmac;
+ u8 hash[UBIFS_HASH_ARR_SZ];
+ u8 hmac[UBIFS_HMAC_ARR_SZ];
if (!ubifs_authenticated(c))
return sleb->nodes_cnt;
- hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS);
- hmac = kmalloc(c->hmac_desc_len, GFP_NOFS);
- if (!hash || !hmac) {
- err = -ENOMEM;
- goto out;
- }
-
list_for_each_entry(snod, &sleb->nodes, list) {
n_nodes++;
@@ -624,7 +609,8 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
if (err)
goto out;
- err = authenticate_sleb_hmac(c, hash, hmac);
+ err = crypto_shash_tfm_digest(c->hmac_tfm, hash,
+ c->hash_len, hmac);
if (err)
goto out;
@@ -662,9 +648,6 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
err = 0;
}
out:
- kfree(hash);
- kfree(hmac);
-
return err ? err : n_nodes - n_not_auth;
}
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 675e26989376..8fe03b4a0d2b 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -164,7 +164,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
goto fail_free;
}
- err = super_setup_bdi_name(sb, "vboxsf-%s.%d", fc->source, sbi->bdi_id);
+ err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id);
if (err)
goto fail_free;
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index d98bea308fd7..5ab3bbec8108 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -329,6 +329,8 @@ rollback:
/**
* fsverity_ioctl_enable() - enable verity on a file
+ * @filp: file to enable verity on
+ * @uarg: user pointer to fsverity_enable_arg
*
* Enable fs-verity on a file. See the "FS_IOC_ENABLE_VERITY" section of
* Documentation/filesystems/fsverity.rst for the documentation.
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 74768cf539da..e96d99d5145e 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -61,7 +61,7 @@ struct merkle_tree_params {
u64 level_start[FS_VERITY_MAX_LEVELS];
};
-/**
+/*
* fsverity_info - cached verity metadata for an inode
*
* When a verity file is first opened, an instance of this struct is allocated
@@ -134,7 +134,7 @@ void __init fsverity_check_hash_algs(void);
/* init.c */
-extern void __printf(3, 4) __cold
+void __printf(3, 4) __cold
fsverity_msg(const struct inode *inode, const char *level,
const char *fmt, ...);
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index 05049b68c745..df409a5682ed 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -11,6 +11,8 @@
/**
* fsverity_ioctl_measure() - get a verity file's measurement
+ * @filp: file to get measurement of
+ * @_uarg: user pointer to fsverity_digest
*
* Retrieve the file measurement that the kernel is enforcing for reads from a
* verity file. See the "FS_IOC_MEASURE_VERITY" section of
diff --git a/fs/verity/open.c b/fs/verity/open.c
index c5fe6948e262..d007db0c9304 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -330,6 +330,7 @@ EXPORT_SYMBOL_GPL(fsverity_prepare_setattr);
/**
* fsverity_cleanup_inode() - free the inode's verity info, if present
+ * @inode: an inode being evicted
*
* Filesystems must call this on inode eviction to free ->i_verity_info.
*/
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index c8b255232de5..b14ed96387ec 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -28,6 +28,9 @@ static struct key *fsverity_keyring;
/**
* fsverity_verify_signature() - check a verity file's signature
+ * @vi: the file's fsverity_info
+ * @desc: the file's fsverity_descriptor
+ * @desc_size: size of @desc
*
* If the file's fs-verity descriptor includes a signature of the file
* measurement, verify it against the certificates in the fs-verity keyring.
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index e0cb62da3864..a8b68c6f663d 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -179,6 +179,7 @@ out:
/**
* fsverity_verify_page() - verify a data page
+ * @page: the page to verity
*
* Verify a page that has just been read from a verity file. The page must be a
* pagecache page that is still locked and not yet uptodate.
@@ -206,6 +207,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page);
#ifdef CONFIG_BLOCK
/**
* fsverity_verify_bio() - verify a 'read' bio that has just completed
+ * @bio: the bio to verify
*
* Verify a set of pages that have just been read from a verity file. The pages
* must be pagecache pages that are still locked and not yet uptodate. Pages
@@ -264,6 +266,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_bio);
/**
* fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue
+ * @work: the work to enqueue
*
* Enqueue verification work for asynchronous processing.
*/
diff --git a/fs/xattr.c b/fs/xattr.c
index e13265e65871..91608d9bfc6a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -876,6 +876,9 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
struct simple_xattr *new_xattr = NULL;
int err = 0;
+ if (removed_size)
+ *removed_size = -1;
+
/* value == NULL means remove */
if (value) {
new_xattr = simple_xattr_alloc(value, size);
@@ -914,9 +917,6 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
list_add(&new_xattr->list, &xattrs->head);
xattr = NULL;
}
-
- if (removed_size)
- *removed_size = -1;
out:
spin_unlock(&xattrs->lock);
if (xattr) {
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index a7be7a9e5c1a..8bf1d15be3f6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -911,7 +911,12 @@ xfs_eofblocks_worker(
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_eofblocks_work);
+
+ if (!sb_start_write_trylock(mp->m_super))
+ return;
xfs_icache_free_eofblocks(mp, NULL);
+ sb_end_write(mp->m_super);
+
xfs_queue_eofblocks(mp);
}
@@ -938,7 +943,12 @@ xfs_cowblocks_worker(
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_cowblocks_work);
+
+ if (!sb_start_write_trylock(mp->m_super))
+ return;
xfs_icache_free_cowblocks(mp, NULL);
+ sb_end_write(mp->m_super);
+
xfs_queue_cowblocks(mp);
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index cdfb3cd9a25b..309958186d33 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -2363,7 +2363,10 @@ xfs_file_ioctl(
if (error)
return error;
- return xfs_icache_free_eofblocks(mp, &keofb);
+ sb_start_write(mp->m_super);
+ error = xfs_icache_free_eofblocks(mp, &keofb);
+ sb_end_write(mp->m_super);
+ return error;
}
default:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 50c43422fa17..b2e4598fdf7d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -167,8 +167,12 @@ typedef struct xfs_mount {
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
- struct ratelimit_state m_flush_inodes_ratelimit;
+ /*
+ * Workqueue item so that we can coalesce multiple inode flush attempts
+ * into a single flush.
+ */
+ struct work_struct m_flush_inodes_work;
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b0ce04ffd3cd..107bf2a2f344 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1051,6 +1051,7 @@ xfs_reflink_remap_extent(
uirec.br_startblock = irec->br_startblock + rlen;
uirec.br_startoff = irec->br_startoff + rlen;
uirec.br_blockcount = unmap_len - rlen;
+ uirec.br_state = irec->br_state;
unmap_len = rlen;
/* If this isn't a real mapping, we're done. */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index abf06bf9c3f3..424bb9a2d532 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -516,6 +516,20 @@ xfs_destroy_mount_workqueues(
destroy_workqueue(mp->m_buf_workqueue);
}
+static void
+xfs_flush_inodes_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(work, struct xfs_mount,
+ m_flush_inodes_work);
+ struct super_block *sb = mp->m_super;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ sync_inodes_sb(sb);
+ up_read(&sb->s_umount);
+ }
+}
+
/*
* Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
* or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
@@ -526,15 +540,15 @@ void
xfs_flush_inodes(
struct xfs_mount *mp)
{
- struct super_block *sb = mp->m_super;
-
- if (!__ratelimit(&mp->m_flush_inodes_ratelimit))
+ /*
+ * If flush_work() returns true then that means we waited for a flush
+ * which was already in progress. Don't bother running another scan.
+ */
+ if (flush_work(&mp->m_flush_inodes_work))
return;
- if (down_read_trylock(&sb->s_umount)) {
- sync_inodes_sb(sb);
- up_read(&sb->s_umount);
- }
+ queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work);
+ flush_work(&mp->m_flush_inodes_work);
}
/* Catch misguided souls that try to use this interface on XFS */
@@ -1369,17 +1383,6 @@ xfs_fc_fill_super(
if (error)
goto out_free_names;
- /*
- * Cap the number of invocations of xfs_flush_inodes to 16 for every
- * quarter of a second. The magic numbers here were determined by
- * observation neither to cause stalls in writeback when there are a
- * lot of IO threads and the fs is near ENOSPC, nor cause any fstest
- * regressions. YMMV.
- */
- ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16);
- ratelimit_set_flags(&mp->m_flush_inodes_ratelimit,
- RATELIMIT_MSG_ON_RELEASE);
-
error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
@@ -1752,6 +1755,7 @@ static int xfs_init_fs_context(
spin_lock_init(&mp->m_perag_lock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
+ INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);