summary | refs | log | tree | commit | diff
path: root/arch
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2024-09-27 12:32:06 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2024-09-27 12:32:06 -0700
commit 34e1a5d43c5deec563b94f3330b690dde9d1de53 (patch)
tree 304dbe97fd7e31b5d64f888ea51efabbc7bb9fbb /arch
parent 9c44575c78dbcdf89bd9f9bc3869ce8ab5cc1272 (diff)
parent 9805f39d423a30a7189158905ec3d71774fe98a1 (diff)
Merge tag 'random-6.12-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random
Pull more random number generator updates from Jason Donenfeld:

 - Christophe realized that the LoongArch64 instructions could be
   scheduled more similarly to how GCC generates code, which Ruoyao
   implemented, for a 5% speedup from basically some rearrangements

 - An update to MAINTAINERS to match the right files

* tag 'random-6.12-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random:
  LoongArch: vDSO: Tune chacha implementation
  MAINTAINERS: make vDSO getrandom matches more generic
Diffstat (limited to 'arch')
-rw-r--r-- arch/loongarch/vdso/vgetrandom-chacha.S 92
1 file changed, 55 insertions, 37 deletions
diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
index 7e86a50f6e85..c2733e6c3a8d 100644
--- a/arch/loongarch/vdso/vgetrandom-chacha.S
+++ b/arch/loongarch/vdso/vgetrandom-chacha.S
@@ -9,23 +9,11 @@
.text
-/* Salsa20 quarter-round */
-.macro QR a b c d
- add.w \a, \a, \b
- xor \d, \d, \a
- rotri.w \d, \d, 16
-
- add.w \c, \c, \d
- xor \b, \b, \c
- rotri.w \b, \b, 20
-
- add.w \a, \a, \b
- xor \d, \d, \a
- rotri.w \d, \d, 24
-
- add.w \c, \c, \d
- xor \b, \b, \c
- rotri.w \b, \b, 25
+.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
+ \op \d0, \d0, \s0
+ \op \d1, \d1, \s1
+ \op \d2, \d2, \s2
+ \op \d3, \d3, \s3
.endm
/*
@@ -74,6 +62,23 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
/* Reuse i as copy3 */
#define copy3 i
+/* Packs to be used with OP_4REG */
+#define line0 state0, state1, state2, state3
+#define line1 state4, state5, state6, state7
+#define line2 state8, state9, state10, state11
+#define line3 state12, state13, state14, state15
+
+#define line1_perm state5, state6, state7, state4
+#define line2_perm state10, state11, state8, state9
+#define line3_perm state15, state12, state13, state14
+
+#define copy copy0, copy1, copy2, copy3
+
+#define _16 16, 16, 16, 16
+#define _20 20, 20, 20, 20
+#define _24 24, 24, 24, 24
+#define _25 25, 25, 25, 25
+
/*
* The ABI requires s0-s9 saved, and sp aligned to 16-byte.
* This does not violate the stack-less requirement: no sensitive data
@@ -126,16 +131,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
li.w i, 10
.Lpermute:
/* odd round */
- QR state0, state4, state8, state12
- QR state1, state5, state9, state13
- QR state2, state6, state10, state14
- QR state3, state7, state11, state15
+ OP_4REG add.w line0, line1
+ OP_4REG xor line3, line0
+ OP_4REG rotri.w line3, _16
+
+ OP_4REG add.w line2, line3
+ OP_4REG xor line1, line2
+ OP_4REG rotri.w line1, _20
+
+ OP_4REG add.w line0, line1
+ OP_4REG xor line3, line0
+ OP_4REG rotri.w line3, _24
+
+ OP_4REG add.w line2, line3
+ OP_4REG xor line1, line2
+ OP_4REG rotri.w line1, _25
/* even round */
- QR state0, state5, state10, state15
- QR state1, state6, state11, state12
- QR state2, state7, state8, state13
- QR state3, state4, state9, state14
+ OP_4REG add.w line0, line1_perm
+ OP_4REG xor line3_perm, line0
+ OP_4REG rotri.w line3_perm, _16
+
+ OP_4REG add.w line2_perm, line3_perm
+ OP_4REG xor line1_perm, line2_perm
+ OP_4REG rotri.w line1_perm, _20
+
+ OP_4REG add.w line0, line1_perm
+ OP_4REG xor line3_perm, line0
+ OP_4REG rotri.w line3_perm, _24
+
+ OP_4REG add.w line2_perm, line3_perm
+ OP_4REG xor line1_perm, line2_perm
+ OP_4REG rotri.w line1_perm, _25
addi.w i, i, -1
bnez i, .Lpermute
@@ -147,10 +174,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
li.w copy3, 0x6b206574
/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
- add.w state0, state0, copy0
- add.w state1, state1, copy1
- add.w state2, state2, copy2
- add.w state3, state3, copy3
+ OP_4REG add.w line0, copy
st.w state0, output, 0
st.w state1, output, 4
st.w state2, output, 8
@@ -165,10 +189,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
ld.w state3, key, 12
/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
- add.w state4, state4, state0
- add.w state5, state5, state1
- add.w state6, state6, state2
- add.w state7, state7, state3
+ OP_4REG add.w line1, line0
st.w state4, output, 16
st.w state5, output, 20
st.w state6, output, 24
@@ -181,10 +202,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
ld.w state3, key, 28
/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
- add.w state8, state8, state0
- add.w state9, state9, state1
- add.w state10, state10, state2
- add.w state11, state11, state3
+ OP_4REG add.w line2, line0
st.w state8, output, 32
st.w state9, output, 36
st.w state10, output, 40