author     Marshall Lochbaum <mwlochbaum@gmail.com>   2024-05-10 08:30:08 -0400
committer  Marshall Lochbaum <mwlochbaum@gmail.com>   2024-05-10 08:30:08 -0400
commit     526d9bbebe4e5f5539cc6a4ca4978927fbb5feab
tree       b47724fca7536bde8c974f644a5414fc07a3a3ad
parent     0b1ba06bc0d33d91058a4337641e385dfd0752b7
Wrap non-destructuring type parameters in parens
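
In Singeli, a bare identifier after `:` in a generator parameter is a destructuring pattern: it binds a fresh name to the argument's type rather than matching an existing type. Wrapping the annotation in parentheses forces it to be evaluated as an expression, so the definition applies only when the argument's type equals that value. A minimal sketch of the two behaviors (hypothetical generators `double` and `low_bit`):

    def double{x:T} = x+x          # destructuring: T is bound to x's type; any typed argument matches
    def low_bit{x:(u64)} = x&1     # evaluated: applies only when x has type u64

Hence the mechanical rewrite below: annotations that name a concrete type, such as `x:u1` or `a:[8]f32`, become `x:(u1)` and `a:([8]f32)`.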
 src/singeli/src/avx.singeli       |  4
 src/singeli/src/avx2.singeli      |  8
 src/singeli/src/base.singeli      | 16
 src/singeli/src/bins.singeli      |  2
 src/singeli/src/bitops.singeli    | 44
 src/singeli/src/bmi2.singeli      |  8
 src/singeli/src/cmp.singeli       |  8
 src/singeli/src/copy.singeli      |  2
 src/singeli/src/debug.singeli     |  2
 src/singeli/src/f64.singeli       | 20
 src/singeli/src/fold.singeli      |  8
 src/singeli/src/hashtab.singeli   |  6
 src/singeli/src/mask.singeli      |  4
 src/singeli/src/replicate.singeli | 14
 src/singeli/src/scan.singeli      | 12
 src/singeli/src/search.singeli    |  8
 src/singeli/src/select.singeli    |  6
 src/singeli/src/sse2.singeli      | 24

18 files changed, 98 insertions(+), 98 deletions(-)
diff --git a/src/singeli/src/avx.singeli b/src/singeli/src/avx.singeli
index c3c645b0..b4b6894f 100644
--- a/src/singeli/src/avx.singeli
+++ b/src/singeli/src/avx.singeli
@@ -20,8 +20,8 @@ def unord{a:T,b:T & T==[8]f32} = f32cmpAVX{a,b,3}
 def unord{a:T,b:T & T==[4]f64} = f64cmpAVX{a,b,3}
 
 # f32 arith
-def rsqrtE{a:[8]f32} = emit{[8]f32, '_mm256_rsqrt_ps', a}
-def rcpE{a:[8]f32} = emit{[8]f32, '_mm256_rcp_ps', a}
+def rsqrtE{a:([8]f32)} = emit{[8]f32, '_mm256_rsqrt_ps', a}
+def rcpE{a:([8]f32)} = emit{[8]f32, '_mm256_rcp_ps', a}
 
 # conversion
 def half{x:T, i & w256{T} & knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i}
diff --git a/src/singeli/src/avx2.singeli b/src/singeli/src/avx2.singeli
index 3e317ace..d50e1543 100644
--- a/src/singeli/src/avx2.singeli
+++ b/src/singeli/src/avx2.singeli
@@ -1,8 +1,8 @@
 # questionable pack
-def unpackQ{a:[32]i8, b:[32]i8 } = { tup{emit{[16]i16, '_mm256_unpacklo_epi8', a, b}, emit{[16]i16, '_mm256_unpackhi_epi8', a, b}}}
-def unpackQ{a:[16]i16, b:[16]i16} = { tup{emit{[ 8]i32, '_mm256_unpacklo_epi16', a, b}, emit{[ 8]i32, '_mm256_unpackhi_epi16', a, b}}}
-def unpackQ{a:[ 8]i32, b:[ 8]i32} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi32', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi32', a, b}}}
-def unpackQ{a:[ 4]i64, b:[ 4]i64} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi64', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi64', a, b}}}
+def unpackQ{a:T,b:T & T==[32]i8 } = { tup{emit{[16]i16, '_mm256_unpacklo_epi8', a, b}, emit{[16]i16, '_mm256_unpackhi_epi8', a, b}}}
+def unpackQ{a:T,b:T & T==[16]i16} = { tup{emit{[ 8]i32, '_mm256_unpacklo_epi16', a, b}, emit{[ 8]i32, '_mm256_unpackhi_epi16', a, b}}}
+def unpackQ{a:T,b:T & T==[ 8]i32} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi32', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi32', a, b}}}
+def unpackQ{a:T,b:T & T==[ 4]i64} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi64', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi64', a, b}}}
 # inverse of questionable pack; these saturate the argument
 def packQ{a:T,b:T & T==[16]i16} = emit{[32]i8, '_mm256_packs_epi16', a, b}
 def packQ{a:T,b:T & T==[ 8]i32} = emit{[16]i16, '_mm256_packs_epi32', a, b}
diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli
index d5e11120..7bb4f8e2 100644
--- a/src/singeli/src/base.singeli
+++ b/src/singeli/src/base.singeli
@@ -32,12 +32,12 @@ def load{p:P, n & isvec{eltype{P}}} = assert{0}
 def store{p:P, n, v & isvec{eltype{P}}} = assert{0}
 def load{p:P & isptr{P}} = load{p, 0}
 # def store{p:P, v & isptr{P}} = store{p, 0, v}
-def loadu{p:T & isunsigned{eltype{T}}} = emit{eltype{T}, merge{'loadu_u',fmtnat{elwidth{T}}}, p}
-def storeu{p:T, v:eltype{T} & isunsigned{eltype{T}}} = emit{void, merge{'storeu_u',fmtnat{elwidth{T}}}, p, v}
-def loadu{p:T & issigned{eltype{T}}} = loadu {*ty_u{eltype{T}} ~~ p}
-def storeu{p:T, v:eltype{T} & issigned{eltype{T}}} = storeu{*ty_u{eltype{T}} ~~ p, ty_u{v}}
-def loadu{p:T & elwidth{T}==8} = load{p}
-def storeu{p:T, v:eltype{T} & elwidth{T}==8} = store{p, v}
+def loadu{p:T & isunsigned{eltype{T}}} = emit{eltype{T}, merge{'loadu_u',fmtnat{elwidth{T}}}, p}
+def storeu{p:T, v:(eltype{T}) & isunsigned{eltype{T}}} = emit{void, merge{'storeu_u',fmtnat{elwidth{T}}}, p, v}
+def loadu{p:T & issigned{eltype{T}}} = loadu {*ty_u{eltype{T}} ~~ p}
+def storeu{p:T, v:(eltype{T}) & issigned{eltype{T}}} = storeu{*ty_u{eltype{T}} ~~ p, ty_u{v}}
+def loadu{p:T & elwidth{T}==8} = load{p}
+def storeu{p:T, v:(eltype{T}) & elwidth{T}==8} = store{p, v}
 
 def reinterpret{T, x:X & T==X} = x
 
@@ -47,11 +47,11 @@ def exportT{name, fs} = { v:*type{tupsel{0,fs}} = fs; export{name, v} }
 
 # hints
 def rare{x & knum{x}} = x
-def rare{x:u1} = emit{u1, '__builtin_expect', x, 0}
+def rare{x:(u1)} = emit{u1, '__builtin_expect', x, 0}
 def assert{x & x==0} = assert{'failed assertion'}
 def assert{x & x==1} = 1
 def unreachable{} = emit{void, 'si_unreachable'}
-def assert{x:u1} = { if (not x) emit{void, 'si_unreachable'} }
+def assert{x:(u1)} = { if (not x) emit{void, 'si_unreachable'} }
 
 # various checks
 def oneVal{{h, ...t}} = {
diff --git a/src/singeli/src/bins.singeli b/src/singeli/src/bins.singeli
index adc56ef7..8aade258 100644
--- a/src/singeli/src/bins.singeli
+++ b/src/singeli/src/bins.singeli
@@ -111,7 +111,7 @@ fn write_indices{I,T}(t:*I, w:*T, n:u64) : void = {
 fn write_indices{I,T & width{I}==8}(t:*I, w:*T, n:u64) : void = {
   @for (w over j to n) store{t, w, cast_i{I, j+1}}
 }
-def bins_lookup{I, T, up, w:*T, wn:u64, x:*T, xn:u64, rp:*void} = {
+def bins_lookup{I, T, up, w:*T, wn:(u64), x:*T, xn:(u64), rp:(*void)} = {
   # Build table
   def tc = 1<<width{T}
   t0:*I = talloc{I, tc}
diff --git a/src/singeli/src/bitops.singeli b/src/singeli/src/bitops.singeli
index 8eb08149..e7818571 100644
--- a/src/singeli/src/bitops.singeli
+++ b/src/singeli/src/bitops.singeli
@@ -1,26 +1,26 @@
 def ones{T, n} = (cast{T,1}<<cast{T,n}) - 1
 
-def b_get{x:*u64, n:(ux)} = {
+def b_get{x:(*u64), n:(ux)} = {
   ((load{x,n>>6}>>(n&63)) & 1) != 0
 }
 
-def b_getBatchLo{sz, x:*u64, n:(ux) & sz==2} = (load{*u8~~x, n>>2} >> cast_i{u8, (n&3)*2})
-def b_getBatchLo{sz, x:*u64, n:(ux) & sz==4} = (load{*u8~~x, n>>1} >> cast_i{u8, (n&1)*4})
-def b_getBatchLo{sz, x:*u64, n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
+def b_getBatchLo{sz, x:(*u64), n:(ux) & sz==2} = (load{*u8~~x, n>>2} >> cast_i{u8, (n&3)*2})
+def b_getBatchLo{sz, x:(*u64), n:(ux) & sz==4} = (load{*u8~~x, n>>1} >> cast_i{u8, (n&1)*4})
+def b_getBatchLo{sz, x:(*u64), n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
 
-def b_getBatch{sz, x:*u64, n:(ux) & sz==2} = b_getBatchLo{sz, x, n} & 3
-def b_getBatch{sz, x:*u64, n:(ux) & sz==4} = b_getBatchLo{sz, x, n} & 15
-def b_getBatch{sz, x:*u64, n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
+def b_getBatch{sz, x:(*u64), n:(ux) & sz==2} = b_getBatchLo{sz, x, n} & 3
+def b_getBatch{sz, x:(*u64), n:(ux) & sz==4} = b_getBatchLo{sz, x, n} & 15
+def b_getBatch{sz, x:(*u64), n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
 
-def b_set{x:*u64, n:(ux), v:u1} = {
+def b_set{x:(*u64), n:(ux), v:(u1)} = {
   m:u64 = cast{u64,1}<<(n&63)
   p:u64 = load{x,n>>6}
   if (v) store{x,n>>6,p | m}
   else store{x,n>>6,p & ~m}
 }
 
-def b_setBatch{sz, x:*u64, n:(ux), v} = {
+def b_setBatch{sz, x:(*u64), n:(ux), v} = {
   vc:u64 = promote{u64,v}
   am:u64 = 64/sz
   w:u64 = load{x,n/am}
@@ -30,7 +30,7 @@ def b_setBatch{sz, x:*u64, n:(ux), v} = {
   store{x, n/am, w}
 }
 
-def b_setBatch{sz, x:*u64, n:(ux), v & sz==4} = {
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz==4} = {
   x8:= *u8 ~~ x
 
   #w:u64 = cast_i{u64, load{x8,n/2}}
@@ -49,12 +49,12 @@ def b_setBatch{sz, x:*u64, n:(ux), v & sz==4} = {
   store{x8, n/2, cast_i{u8,w}}
 }
 
-def b_setBatch{sz, x:*u64, n:(ux), v & sz== 8} = store{*u8 ~~ x, n, cast_i{u8, v}}
-def b_setBatch{sz, x:*u64, n:(ux), v & sz==16} = store{*u16 ~~ x, n, cast_i{u16,v}}
-def b_setBatch{sz, x:*u64, n:(ux), v & sz==32} = store{*u32 ~~ x, n, cast_i{u32,v}}
-def b_setBatch{sz, x:*u64, n:(ux), v & sz==64} = store{ x, n, cast_i{u64,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz== 8} = store{*u8 ~~ x, n, cast_i{u8, v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz==16} = store{*u16 ~~ x, n, cast_i{u16,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz==32} = store{*u32 ~~ x, n, cast_i{u32,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz==64} = store{ x, n, cast_i{u64,v}}
 
-def spreadBits{T==[32]u8, a:u32} = {
+def spreadBits{T==[32]u8, a:(u32)} = {
   def idxs = iota{32}
   b:= [8]u32**a
   c:= [32]u8~~b
@@ -63,11 +63,11 @@ def spreadBits{T==[32]u8, a:(u32)} = {
   e == (d&e)
 }
 
-def spreadBits{T==[16]u8, a:u16 & hasarch{'AARCH64'}} = {
+def spreadBits{T==[16]u8, a:(u16) & hasarch{'AARCH64'}} = {
   b:= sel{[16]u8, [16]u8~~[8]u16**a, make{[16]i8, iota{16}>=8}}
   andnz{b, make{[16]u8, 1<<(iota{16}&7)}}
 }
-def spreadBits{T==[16]u8, a:u16 & hasarch{'X86_64'}} = {
+def spreadBits{T==[16]u8, a:(u16) & hasarch{'X86_64'}} = {
   b:= [16]u8~~[8]u16**a
   exp:= [16]u8~~shuf{[4]i32, shuf16Lo{mzipLo{b, b}, 4b1100}, 4b1100}
   (exp & make{[16]u8, 1<<(iota{16}&7)}) != [16]u8**0
@@ -78,22 +78,22 @@ def spreadBits{T, a & vcount{T} <= elwidth{T} & quality{eltype{T}}=='u'} = {
   b == (b & T ~~ re_el{type{a}, T}**a) # not just T**a so that if a is read from RAM, it can use the single instruction for broadcasting from RAM; the extra bits don't matter
 }
 
-def loadBatchBit{T, x:*u64, n:(ux)} = { # vector with type T with each element being either all 0s or 1s
+def loadBatchBit{T, x:(*u64), n:(ux)} = { # vector with type T with each element being either all 0s or 1s
   spreadBits{T, b_getBatchLo{vcount{T}, x, n}}
 }
 
 # load bits starting at bit i, leaving garbage at the top. Only the bottom 57 bits are guaranteed correct; 58 and 60 will be correct if `i` is a multiple of it
-def loaduBitRaw{x:*u64, i} = {
+def loaduBitRaw{x:(*u64), i} = {
   loadu{*u64~~((*u8~~x) + (i>>3))} >> (i&7)
 }
-def loaduBit{x:*u64, i, n} = {
+def loaduBit{x:(*u64), i, n} = {
   assert{(n<58) | (((n==58) | (n==60)) & (i%n == 0))}
   loaduBitRaw{x, i}
 }
-def loaduBitTrunc{x:*u64, i, n & knum{n}} = truncBits{n, loaduBit{x, i, n}}
+def loaduBitTrunc{x:(*u64), i, n & knum{n}} = truncBits{n, loaduBit{x, i, n}}
 
-def loadBatchBit{T, x:*u64, is & ktup{is}} = {
+def loadBatchBit{T, x:(*u64), is & ktup{is}} = {
   # def len = tuplen{is}
   # def count = vcount{T}
   # assert{count*len <= 64}
diff --git a/src/singeli/src/bmi2.singeli b/src/singeli/src/bmi2.singeli
index cf13020e..640ca3bd 100644
--- a/src/singeli/src/bmi2.singeli
+++ b/src/singeli/src/bmi2.singeli
@@ -1,4 +1,4 @@
-def pdep{x:u64, m:u64} = emit{u64, '_pdep_u64', x, m}
-def pdep{x:u32, m:u32} = emit{u32, '_pdep_u32', x, m}
-def pext{x:u64, m:u64} = emit{u64, '_pext_u64', x, m}
-def pext{x:u32, m:u32} = emit{u32, '_pext_u32', x, m}
+def pdep{x:(u64), m:(u64)} = emit{u64, '_pdep_u64', x, m}
+def pdep{x:(u32), m:(u32)} = emit{u32, '_pdep_u32', x, m}
+def pext{x:(u64), m:(u64)} = emit{u64, '_pext_u64', x, m}
+def pext{x:(u32), m:(u32)} = emit{u32, '_pext_u32', x, m}
diff --git a/src/singeli/src/cmp.singeli b/src/singeli/src/cmp.singeli
index 23d1aeca..e47db6ca 100644
--- a/src/singeli/src/cmp.singeli
+++ b/src/singeli/src/cmp.singeli
@@ -4,11 +4,11 @@
 include './f64'
 include './bitops'
 
-def fillbits{dst:*u64, len:(ux), v } = { emit{void, 'fillBits', dst, len, v }; return{}; }
-def fillbits{dst:*u64, len:(ux), v, x} = { emit{void, 'fillBitsDec', dst, len, v, x}; return{}; }
+def fillbits{dst:(*u64), len:(ux), v } = { emit{void, 'fillBits', dst, len, v }; return{}; }
+def fillbits{dst:(*u64), len:(ux), v, x} = { emit{void, 'fillBitsDec', dst, len, v, x}; return{}; }
 def cmp_err{x} = { emit{void, 'cmp_err'}; return{}; }
 
-fn cmpIX(dst:*u64, len:ux, x:u64, v:u1) : void = {
+fn cmpIX(dst:(*u64), len:ux, x:(u64), v:(u1)) : void = {
   nan:u1 = q_f64{x}
   if (~(nan | q_chr{x})) cmp_err{x}
   fillbits{dst, len, v&~nan, x}
@@ -60,7 +60,7 @@ def pathAS{dst, len, T, op, x & isunsigned{T}} = {
 
 
 
-def any2bit{VT, unr, op0, wS, wV, xS, xV, dst:*u64, len:(ux)} = {
+def any2bit{VT, unr, op0, wS, wV, xS, xV, dst:(*u64), len:(ux)} = {
   def bulk = vcount{VT}*unr
 
   xi:ux = 0
diff --git a/src/singeli/src/copy.singeli b/src/singeli/src/copy.singeli
index cf70423d..450cc1aa 100644
--- a/src/singeli/src/copy.singeli
+++ b/src/singeli/src/copy.singeli
@@ -3,7 +3,7 @@ include './mask'
 include './cbqnDefs'
 include './bitops'
 
-def copyFromBits{T, loadFn, rp, l:u64} = {
+def copyFromBits{T, loadFn, rp, l:(u64)} = {
   def bulk = vcount{T}
   def TU = ty_u{T}
 
diff --git a/src/singeli/src/debug.singeli b/src/singeli/src/debug.singeli
index 4614fb4a..12578928 100644
--- a/src/singeli/src/debug.singeli
+++ b/src/singeli/src/debug.singeli
@@ -1,4 +1,4 @@
 include 'debug/printf' # printf & lprintf
 
-def assert{x:u1} = { if (not x) emit{void, '__builtin_trap'} }
+def assert{x:(u1)} = { if (not x) emit{void, '__builtin_trap'} }
 def test_assert = assert # test_assert is guaranteed to either not exist, or always trap on bad input
diff --git a/src/singeli/src/f64.singeli b/src/singeli/src/f64.singeli
index 646e54c4..00dda936 100644
--- a/src/singeli/src/f64.singeli
+++ b/src/singeli/src/f64.singeli
@@ -1,15 +1,15 @@
-def ceil{x:f64} = emit{f64, 'ceil', x}
-def floor{x:f64} = emit{f64, 'floor', x}
-def abs{x:f64} = emit{f64, 'fabs', x}
+def ceil{x:(f64)} = emit{f64, 'ceil', x}
+def floor{x:(f64)} = emit{f64, 'floor', x}
+def abs{x:(f64)} = emit{f64, 'fabs', x}
 
 def NaN = 0.0/0.0
-def isNaN{x:f64} = x!=x
-def qNaN{x:u64} = (x<<1) == (cast{u64, 0x8ff8} << 49)
+def isNaN{x:(f64)} = x!=x
+def qNaN{x:(u64)} = (x<<1) == (cast{u64, 0x8ff8} << 49)
 
-def ftrunc{T, x:f64 & i8==T} = emit{i8, '', x}
-def ftrunc{T, x:f64 & i16==T} = emit{i16, '', x}
-def ftrunc{T, x:f64 & i32==T} = emit{i32, '', x} # maybe explicitly use _mm_cvtsd_si32?
-def ftrunc{T, x:f64 & i64==T} = emit{i64, '', x}
+def ftrunc{T, x:(f64) & i8==T} = emit{i8, '', x}
+def ftrunc{T, x:(f64) & i16==T} = emit{i16, '', x}
+def ftrunc{T, x:(f64) & i32==T} = emit{i32, '', x} # maybe explicitly use _mm_cvtsd_si32?
+def ftrunc{T, x:(f64) & i64==T} = emit{i64, '', x}
 def fext{x} = emit{f64, '', x}
 
-def interp_f64{x:u64} = emit{f64, 'interp_f64', x}
\ No newline at end of file
+def interp_f64{x:(u64)} = emit{f64, 'interp_f64', x}
diff --git a/src/singeli/src/fold.singeli b/src/singeli/src/fold.singeli
index f513360d..bb132d4e 100644
--- a/src/singeli/src/fold.singeli
+++ b/src/singeli/src/fold.singeli
@@ -1,10 +1,10 @@
 include './base'
 include './mask'
 
-def opsh64{op}{v:[4]f64, perm} = op{v, shuf{[4]u64, v, perm}}
-def opsh32{op}{v:[2]f64, perm} = op{v, shuf{[4]u32, v, perm}}
-def mix{op, v:[4]f64 & hasarch{'AVX'}} = { def sh=opsh64{op}; sh{sh{v, 4b2301}, 4b1032} }
-def mix{op, v:[2]f64 & hasarch{'X86_64'}} = opsh32{op}{v, 4b1032}
+def opsh64{op}{v:([4]f64), perm} = op{v, shuf{[4]u64, v, perm}}
+def opsh32{op}{v:([2]f64), perm} = op{v, shuf{[4]u32, v, perm}}
+def mix{op, v:([4]f64) & hasarch{'AVX'}} = { def sh=opsh64{op}; sh{sh{v, 4b2301}, 4b1032} }
+def mix{op, v:([2]f64) & hasarch{'X86_64'}} = opsh32{op}{v, 4b1032}
 
 def reduce_pairwise{op, plog, x:*T, len, init:T} = {
   # Pairwise combination to shorten dependency chains
diff --git a/src/singeli/src/hashtab.singeli b/src/singeli/src/hashtab.singeli
index 411a8d5d..a14cc5dd 100644
--- a/src/singeli/src/hashtab.singeli
+++ b/src/singeli/src/hashtab.singeli
@@ -15,13 +15,13 @@ def memset{p:pT, v, l} = {
 
 # These hashes are stored in tables and must be invertible!
 # Murmur3
-def hash_val{x0:u32} = {
+def hash_val{x0:(u32)} = {
   x := x0
   x ^= x >> 16; x *= 0x85ebca6b
   x ^= x >> 13; x *= 0xc2b2ae35
   x ^= x >> 16; x
 }
-def hash_val{x0:u64} = {
+def hash_val{x0:(u64)} = {
   x := x0
   x ^= x >> 33; x *= 0xff51afd7ed558ccd
   x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53
@@ -29,7 +29,7 @@
 }
 
 # CRC32
 if (hasarch{'SSE4.2'}) require{'x86intrin.h'}
-def hash_val{x:u32 & hasarch{'SSE4.2'}} = {
+def hash_val{x:(u32) & hasarch{'SSE4.2'}} = {
   emit{u32, '_mm_crc32_u32', 0x973afb51, x}
 }
diff --git a/src/singeli/src/mask.singeli b/src/singeli/src/mask.singeli
index 60232f3a..598bb439 100644
--- a/src/singeli/src/mask.singeli
+++ b/src/singeli/src/mask.singeli
@@ -70,8 +70,8 @@ def storeBatch{ptr:P, ns, xs, M & istup{ns}} = each{{n,x} => storeBatch{ptr, n,
 
 # "harmless" pointer cast that'll only cast void*
 def hCast{T,p} = assert{show{'expected pointer with element',T,'or void but got ',p}}
-def hCast{T,p:P & same{T,eltype{P}}} = p
-def hCast{T,p:P & same{P,*void}} = *T~~p
+def hCast{T,p:*T} = p
+def hCast{T,p:(*void)} = *T~~p
 
 def mlExec{i, iter, vars0, bulk, M} = {
   def vproc{p:P & isptr{P}} = p
diff --git a/src/singeli/src/replicate.singeli b/src/singeli/src/replicate.singeli
index 98a9f428..45074114 100644
--- a/src/singeli/src/replicate.singeli
+++ b/src/singeli/src/replicate.singeli
@@ -20,7 +20,7 @@ def scan_core{upd, set, scan, rp:pT, wp:W, s:(usz)} = {
     k = e
   }
 }
-def indrep_by_sum{T, rp:*T, wp, s:(usz), js, inc} = {
+def indrep_by_sum{T, rp:(*T), wp, s:(usz), js, inc} = {
   def scan{ptr, len} = @for (ptr over len) js=ptr+=js
   def scan{ptr, len & width{T}<=32} = {
     def scanfn = merge{'si_scan_pluswrap_u',fmtnat{width{T}}}
@@ -45,7 +45,7 @@ fn ind_by_scan_i32{W}(xv:*void, rp:*i32, s:usz) : void = {
   }
 }
 
-def rep_by_scan{T, wp, xv:*void, rv:*void, s} = {
+def rep_by_scan{T, wp, xv:(*void), rv:(*void), s} = {
   xp := *T~~xv; js := *xp; px := js
   def inc{j} = {sx:=px; px=load{xp,j}; px-sx}
   indrep_by_sum{T, *T~~rv, wp, s, js, inc}
@@ -101,7 +101,7 @@ def rcsh4_dom = replicate{bind{>=,64}, replicate{fact_tab==1, fact_inds}}
 rcsh4_dat:*i8 = join{join{each{get_shuf_data{., 4}, rcsh4_dom}}}
 rcsh4_lkup:*i8 = shiftright{0, scan{+, fold{|, table{==, rcsh4_dom, iota{64}}}}}
 
-def read_shuf_vecs{l, ellw:u64, shp:P} = { # tuple of byte selectors in 1<<ellw
+def read_shuf_vecs{l, ellw:(u64), shp:P} = { # tuple of byte selectors in 1<<ellw
   def V = eltype{P}
   def double{x:X & hasarch{'AVX2'}} = {
     s:=shuf{[4]u64, x, 4b3120}; s+=s
@@ -126,7 +126,7 @@ def read_shuf_vecs{l, ellw:(u64), shp:P} = { # tuple of byte selectors in 1<<ellw
   sh
 }
 
-def rep_const_shuffle{V, wv, onreps, xv:*V, rv:*V, n:u64} = { # onreps{inputVector, {nextOutputVector} => ...}
+def rep_const_shuffle{V, wv, onreps, xv:*V, rv:*V, n:(u64)} = { # onreps{inputVector, {nextOutputVector} => ...}
  def step = vcount{V}
  nv := n / step
  j:u64 = 0
@@ -168,7 +168,7 @@ if (hasarch{'AVX2'}) {
     {x, gen} => each{{s}=>gen{shuf{V, x, s}}, sh}
   }
 
-  def rep_const_shuffle{V, wv, xv:*V, rv:*V, n:u64} = rep_const_shuffle{V, wv, get_rep_iter{V, wv}, xv, rv, n}
+  def rep_const_shuffle{V, wv, xv:*V, rv:*V, n:(u64)} = rep_const_shuffle{V, wv, get_rep_iter{V, wv}, xv, rv, n}
 
 } else if (hasarch{'AARCH64'}) {
 
@@ -176,7 +176,7 @@ if (hasarch{'AVX2'}) {
     each{{s} => gen{sel{[16]u8, x, s}}, sh}
   }
 
-  def rep_const_shuffle{V, wv==2, xv0:*V, rv0:*V, n:u64} = {
+  def rep_const_shuffle{V, wv==2, xv0:*V, rv0:*V, n:(u64)} = {
     def E = ty_u{eltype{V}}
     rv:= *E~~rv0
     @for (x in *E~~xv0 over i to n) { # autovectorized well enough, probably
@@ -240,7 +240,7 @@ fn rep_const_shuffle_any(wv:u64, ellw:u64, x:*i8, r:*i8, n:u64) : void = {
   each{try, rcsh_vals}
 }
 
-def rep_const_broadcast{T, kv, loop, wv:u64, x:*T, r:*T, n:u64} = {
+def rep_const_broadcast{T, kv, loop, wv:(u64), x:*T, r:*T, n:(u64)} = {
   assert{kv > 0}
   def V = [arch_defvw/width{T}]T
   @for (x over n) {
diff --git a/src/singeli/src/scan.singeli b/src/singeli/src/scan.singeli
index fed3f288..dcf50ec4 100644
--- a/src/singeli/src/scan.singeli
+++ b/src/singeli/src/scan.singeli
@@ -12,7 +12,7 @@ fn scan_scal{T, op}(x:*T, r:*T, len:u64, m:T) : void = {
   @for (x, r over len) r = m = op{m, x}
 }
 
-def scan_loop{T, init, x:*T, r:*T, len:u64, scan, scan_last} = {
+def scan_loop{T, init, x:*T, r:*T, len:(u64), scan, scan_last} = {
   def step = arch_defvw/width{T}
   def V = [step]T
   p:= V**init
@@ -23,7 +23,7 @@
   q:= len & (step-1)
   if (q!=0) homMaskStoreF{rv+e, maskOf{V, q}, scan_last{load{xv,e}, p}}
 }
-def scan_post{T, init, x:*T, r:*T, len:u64, op, pre} = {
+def scan_post{T, init, x:*T, r:*T, len:(u64), op, pre} = {
   def last{v, p} = op{pre{v}, p}
   def scan{v, p} = {
     n:= last{v, p}
@@ -149,7 +149,7 @@ fn bcs{T & hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = {
   def sums{n} = (if (n==0) tup{0}; else { def s=sums{n-1}; merge{s,s+1} })
   def widen{v:T} = unpackQ{shuf{[4]u64, v, 4b3120}, T**0}
 
-  def sumlanes{x:u32} = {
+  def sumlanes{x:(u32)} = {
     b:= [8]u32**x >> make{[8]u32, 4*tail{1, iota{8}}}
     s:= sel8{[32]u8~~b, ii32>>3 + bit{2}}
     p:= s & make{[32]u8, (1<<(1+tail{2})) - 1} # Prefixes
@@ -157,12 +157,12 @@
     d+= sel8{d, bit{2}*(1+bit{3}>>2)-1}
     d + sel8{d, bit{3}-1}
   }
-  def step{x:u32, i, store1} = {
+  def step{x:(u32), i, store1} = {
     d:= sumlanes{x}
     if (w==8) d+= [32]u8~~shuf{[4]u64, [8]i32~~sel8{d, bit{3}<<4-1}, 4b1100}
     j:= (w/8)*i
     def out{v, k} = each{out, widen{v}, 2*k+iota{2}}
-    def out{v0:[vl]T, k} = {
+    def out{v0:([vl]T), k} = {
       v := V~~v0 + c
       # Update carry at the lane boundary
      if (w!=32 or tail{1,k}) {
@@ -239,7 +239,7 @@ fn plus_scan{X, R, O}(x:*X, c:R, r:*R, len:u64) : O = {
   len
 }
 
 # Sum as many vector registers as possible; modifies c and i
-def simd_plus_scan_part{X, R}{x:*X, c:(R), r:*R, len:u64, i:u64} = {
+def simd_plus_scan_part{X, R}{x:(*X), c:(R), r:(*R), len:(u64), i:(u64)} = {
   def b = max{width{R}/2, width{X}}
   def bulk = arch_defvw/b
diff --git a/src/singeli/src/search.singeli b/src/singeli/src/search.singeli
index 0e4d0287..bb7c22df 100644
--- a/src/singeli/src/search.singeli
+++ b/src/singeli/src/search.singeli
@@ -18,7 +18,7 @@ def findFirst{C, M, F, ...v1} = {
   F{...args}
 }
 
-def search{E, x, n:u64, OP} = {
+def search{E, x, n:(u64), OP} = {
   def bulk = arch_defvw/width{E}
   def VT = [bulk]E
   def end = makeBranch{
@@ -111,7 +111,7 @@ def readbytes{vtab}{} = {
 }
 
 # Look up bits from table
-def bittab_lookup{x0:*void, n:u64, r0:*void, tab:*void} = {
+def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void)} = {
   x:= *u8~~x0
   t:= *TI~~tab
   r:= *u64~~r0
@@ -124,7 +124,7 @@
     x+=k; rem-=k; ++r
   }
 }
-def bittab_lookup{x0:*void, n:u64, r0:*void, tab:*void & simd_bittab} = {
+def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void) & simd_bittab} = {
   def {bitsel, _} = bittab_selector{readbytes{*VI~~tab}}
   def k = vcount{VI}
   @for (x in *VI~~x0, r in *ty_u{k}~~r0 over cdiv{n,k}) r = bitsel{x}
@@ -139,7 +139,7 @@
 # - 'mask': Mark Firsts of x0
 # - 'unique': Deduplicate of x0
 # - 'index': First index of value x at r0+x
-def do_bittab{x0:*void, n:u64, tab:*void, u:u8, t, mode, r0} = {
+def do_bittab{x0:(*void), n:(u64), tab:(*void), u:(u8), t, mode, r0} = {
   def rbit = mode == 'mask'
   def rval = mode == 'unique'
   def rind = mode == 'index'
diff --git a/src/singeli/src/select.singeli b/src/singeli/src/select.singeli
index 87724f94..a290baa5 100644
--- a/src/singeli/src/select.singeli
+++ b/src/singeli/src/select.singeli
@@ -9,11 +9,11 @@ include 'util/tup'
 
 # def:T - masked original content
 # b:B - pointer to data to index; if width{B}<elwidth{T}, padding bytes are garbage read after wanted position
 # idx - actual (unscaled) index list
-def gather{d:T, b:B, idx:[8]i32, M & w256{T,32}} = {
+def gather{d:T, b:B, idx:([8]i32), M & w256{T,32}} = {
   if (M{0}) T ~~ emit{[8]i32, '_mm256_mask_i32gather_epi32', d, *void~~b, idx, M{T,'to sign bits'}, elwidth{B}/8}
   else T ~~ emit{[8]i32, '_mm256_i32gather_epi32', *void~~b, idx, elwidth{B}/8}
 }
-def gather{d:T, b:B, idx:[4]i32, M & w256{T,64}} = {
+def gather{d:T, b:B, idx:([4]i32), M & w256{T,64}} = {
   if (M{0}) T ~~ emit{[4]i64, '_mm256_mask_i32gather_epi64', d, *void~~b, idx, M{T,'to sign bits'}, elwidth{B}/8}
   else T ~~ emit{[4]i64, '_mm256_i32gather_epi64', *void~~b, idx, elwidth{B}/8}
 }
@@ -164,4 +164,4 @@ fn avx2_select_bool128(w0:*void, x0:*void, r0:*void, wl:u64, xl:u64) : u1 = {
 }
 export{'avx2_select_bool128', avx2_select_bool128}
 
-}
\ No newline at end of file
+}
diff --git a/src/singeli/src/sse2.singeli b/src/singeli/src/sse2.singeli
index 2bf09a84..8e82ad46 100644
--- a/src/singeli/src/sse2.singeli
+++ b/src/singeli/src/sse2.singeli
@@ -40,8 +40,8 @@ def __mul{a:T,b:T & [4]i32==T} = {
 }
 
 # float arith
-def rsqrtE{a:[4]f32} = emit{[4]f32, '_mm_rsqrt_ps', a}
-def rcpE{a:[4]f32} = emit{[4]f32, '_mm_rcp_ps', a}
+def rsqrtE{a:([4]f32)} = emit{[4]f32, '_mm_rsqrt_ps', a}
+def rcpE{a:([4]f32)} = emit{[4]f32, '_mm_rcp_ps', a}
 
 # mask stuff
 def andAllZero{x:T, y:T & w128i{T}} = homAll{(x & y) == T**0}
@@ -94,15 +94,15 @@ def widen{T==[2]f64, x:X & w128s{X} & elwidth{X}<32} = widen{T, widen{[4]i32, x}}
 def widen{T==[2]f64, x:X & X==[4]i32} = emit{T, '_mm_cvtepi32_pd', x}
 def widen{T==[2]f64, x:X & X==[4]f32} = emit{T, '_mm_cvtps_pd', x}
 
-def narrow{T==i16, x:[4]i32} = packs{x,x}
-def narrow{T==i8, x:[8]i16} = packs{x,x}
-def narrow{T==u8, x:[8]u16} = packs{x,x}
-def narrow{T==u16, x:[4]u32} = [8]u16~~shuf{[4]i32, shuf16Hi{shuf16Lo{x, 4b3320}, 4b3320}, 4b3320}
-def narrow{T==i8, x:[4]i32} = narrow{T, narrow{i16, x}}
-def narrow{T==u8, x:[4]u32} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{x}}}
-def narrow{T==u8, x:[2]u64} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{f{x}}}}
-def narrow{T==u16, x:[2]u64} = shuf16Lo{[8]u16~~shuf{[4]i32, x, 4b3320}, 4b3320}
-def narrow{T==u32, x:[2]u64} = [4]u32~~shuf{[4]i32, x, 4b3320}
+def narrow{T==i16, x:([4]i32)} = packs{x,x}
+def narrow{T==i8, x:([8]i16)} = packs{x,x}
+def narrow{T==u8, x:([8]u16)} = packs{x,x}
+def narrow{T==u16, x:([4]u32)} = [8]u16~~shuf{[4]i32, shuf16Hi{shuf16Lo{x, 4b3320}, 4b3320}, 4b3320}
+def narrow{T==i8, x:([4]i32)} = narrow{T, narrow{i16, x}}
+def narrow{T==u8, x:([4]u32)} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{x}}}
+def narrow{T==u8, x:([2]u64)} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{f{x}}}}
+def narrow{T==u16, x:([2]u64)} = shuf16Lo{[8]u16~~shuf{[4]i32, x, 4b3320}, 4b3320}
+def narrow{T==u32, x:([2]u64)} = [4]u32~~shuf{[4]i32, x, 4b3320}
 
 def narrow{T, x:X & w128f{X,64} & T<i32} = narrow{T, narrow{i32, x}}
-def narrow{T==i32, x:[2]f64} = emit{[4]i32, '_mm_cvtpd_epi32', x}
+def narrow{T==i32, x:([2]f64)} = emit{[4]i32, '_mm_cvtpd_epi32', x}
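
The avx2.singeli `unpackQ` hunk uses a different spelling: instead of parenthesizing both annotations, it destructures both arguments into a shared type variable and pins it with a guard, matching the style of the neighboring `packQ` definitions. A sketch of that idiom (hypothetical generator `addQ`):

    def addQ{a:T, b:T & T==[16]i16} = a+b   # T binds the common type of a and b; the guard then requires [16]i16

For two parameters of the same concrete type, both spellings accept the same arguments.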