author    Marshall Lochbaum <mwlochbaum@gmail.com>  2024-05-10 08:30:08 -0400
committer Marshall Lochbaum <mwlochbaum@gmail.com>  2024-05-10 08:30:08 -0400
commit    526d9bbebe4e5f5539cc6a4ca4978927fbb5feab (patch)
tree      b47724fca7536bde8c974f644a5414fc07a3a3ad
parent    0b1ba06bc0d33d91058a4337641e385dfd0752b7 (diff)
Wrap non-destructuring type parameters in parens
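
As a minimal sketch of the distinction the title refers to (built only from definitions that appear in the diff below, not a complete Singeli reference): in a parameter such as `a:T`, a bare name after `:` is a destructuring pattern that binds to the argument's type, while a parenthesized annotation like `(u64)` or `([8]f32)` is evaluated as an expression and the argument's type must match it.

# Destructuring: T is bound to the argument's type, then constrained
def rcpE{a:T & T==[8]f32} = emit{[8]f32, '_mm256_rcp_ps', a}

# Non-destructuring: the parenthesized type is evaluated, so only [8]f32 matches
def rcpE{a:([8]f32)} = emit{[8]f32, '_mm256_rcp_ps', a}

# Scalar parameters get the same treatment
def pdep{x:(u64), m:(u64)} = emit{u64, '_pdep_u64', x, m}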
 src/singeli/src/avx.singeli       |  4
 src/singeli/src/avx2.singeli      |  8
 src/singeli/src/base.singeli      | 16
 src/singeli/src/bins.singeli      |  2
 src/singeli/src/bitops.singeli    | 44
 src/singeli/src/bmi2.singeli      |  8
 src/singeli/src/cmp.singeli       |  8
 src/singeli/src/copy.singeli      |  2
 src/singeli/src/debug.singeli     |  2
 src/singeli/src/f64.singeli       | 20
 src/singeli/src/fold.singeli      |  8
 src/singeli/src/hashtab.singeli   |  6
 src/singeli/src/mask.singeli      |  4
 src/singeli/src/replicate.singeli | 14
 src/singeli/src/scan.singeli      | 12
 src/singeli/src/search.singeli    |  8
 src/singeli/src/select.singeli    |  6
 src/singeli/src/sse2.singeli      | 24
 18 files changed, 98 insertions(+), 98 deletions(-)
diff --git a/src/singeli/src/avx.singeli b/src/singeli/src/avx.singeli
index c3c645b0..b4b6894f 100644
--- a/src/singeli/src/avx.singeli
+++ b/src/singeli/src/avx.singeli
@@ -20,8 +20,8 @@ def unord{a:T,b:T & T==[8]f32} = f32cmpAVX{a,b,3}
def unord{a:T,b:T & T==[4]f64} = f64cmpAVX{a,b,3}
# f32 arith
-def rsqrtE{a:[8]f32} = emit{[8]f32, '_mm256_rsqrt_ps', a}
-def rcpE{a:[8]f32} = emit{[8]f32, '_mm256_rcp_ps', a}
+def rsqrtE{a:([8]f32)} = emit{[8]f32, '_mm256_rsqrt_ps', a}
+def rcpE{a:([8]f32)} = emit{[8]f32, '_mm256_rcp_ps', a}
# conversion
def half{x:T, i & w256{T} & knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i}
diff --git a/src/singeli/src/avx2.singeli b/src/singeli/src/avx2.singeli
index 3e317ace..d50e1543 100644
--- a/src/singeli/src/avx2.singeli
+++ b/src/singeli/src/avx2.singeli
@@ -1,8 +1,8 @@
# questionable pack
-def unpackQ{a:[32]i8, b:[32]i8 } = { tup{emit{[16]i16, '_mm256_unpacklo_epi8', a, b}, emit{[16]i16, '_mm256_unpackhi_epi8', a, b}}}
-def unpackQ{a:[16]i16, b:[16]i16} = { tup{emit{[ 8]i32, '_mm256_unpacklo_epi16', a, b}, emit{[ 8]i32, '_mm256_unpackhi_epi16', a, b}}}
-def unpackQ{a:[ 8]i32, b:[ 8]i32} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi32', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi32', a, b}}}
-def unpackQ{a:[ 4]i64, b:[ 4]i64} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi64', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi64', a, b}}}
+def unpackQ{a:T,b:T & T==[32]i8 } = { tup{emit{[16]i16, '_mm256_unpacklo_epi8', a, b}, emit{[16]i16, '_mm256_unpackhi_epi8', a, b}}}
+def unpackQ{a:T,b:T & T==[16]i16} = { tup{emit{[ 8]i32, '_mm256_unpacklo_epi16', a, b}, emit{[ 8]i32, '_mm256_unpackhi_epi16', a, b}}}
+def unpackQ{a:T,b:T & T==[ 8]i32} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi32', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi32', a, b}}}
+def unpackQ{a:T,b:T & T==[ 4]i64} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi64', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi64', a, b}}}
# inverse of questionable pack; these saturate the argument
def packQ{a:T,b:T & T==[16]i16} = emit{[32]i8, '_mm256_packs_epi16', a, b}
def packQ{a:T,b:T & T==[ 8]i32} = emit{[16]i16, '_mm256_packs_epi32', a, b}
diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli
index d5e11120..7bb4f8e2 100644
--- a/src/singeli/src/base.singeli
+++ b/src/singeli/src/base.singeli
@@ -32,12 +32,12 @@ def load{p:P, n & isvec{eltype{P}}} = assert{0}
def store{p:P, n, v & isvec{eltype{P}}} = assert{0}
def load{p:P & isptr{P}} = load{p, 0}
# def store{p:P, v & isptr{P}} = store{p, 0, v}
-def loadu{p:T & isunsigned{eltype{T}}} = emit{eltype{T}, merge{'loadu_u',fmtnat{elwidth{T}}}, p}
-def storeu{p:T, v:eltype{T} & isunsigned{eltype{T}}} = emit{void, merge{'storeu_u',fmtnat{elwidth{T}}}, p, v}
-def loadu{p:T & issigned{eltype{T}}} = loadu {*ty_u{eltype{T}} ~~ p}
-def storeu{p:T, v:eltype{T} & issigned{eltype{T}}} = storeu{*ty_u{eltype{T}} ~~ p, ty_u{v}}
-def loadu{p:T & elwidth{T}==8} = load{p}
-def storeu{p:T, v:eltype{T} & elwidth{T}==8} = store{p, v}
+def loadu{p:T & isunsigned{eltype{T}}} = emit{eltype{T}, merge{'loadu_u',fmtnat{elwidth{T}}}, p}
+def storeu{p:T, v:(eltype{T}) & isunsigned{eltype{T}}} = emit{void, merge{'storeu_u',fmtnat{elwidth{T}}}, p, v}
+def loadu{p:T & issigned{eltype{T}}} = loadu {*ty_u{eltype{T}} ~~ p}
+def storeu{p:T, v:(eltype{T}) & issigned{eltype{T}}} = storeu{*ty_u{eltype{T}} ~~ p, ty_u{v}}
+def loadu{p:T & elwidth{T}==8} = load{p}
+def storeu{p:T, v:(eltype{T}) & elwidth{T}==8} = store{p, v}
def reinterpret{T, x:X & T==X} = x
@@ -47,11 +47,11 @@ def exportT{name, fs} = { v:*type{tupsel{0,fs}} = fs; export{name, v} }
# hints
def rare{x & knum{x}} = x
-def rare{x:u1} = emit{u1, '__builtin_expect', x, 0}
+def rare{x:(u1)} = emit{u1, '__builtin_expect', x, 0}
def assert{x & x==0} = assert{'failed assertion'}
def assert{x & x==1} = 1
def unreachable{} = emit{void, 'si_unreachable'}
-def assert{x:u1} = { if (not x) emit{void, 'si_unreachable'} }
+def assert{x:(u1)} = { if (not x) emit{void, 'si_unreachable'} }
# various checks
def oneVal{{h, ...t}} = {
diff --git a/src/singeli/src/bins.singeli b/src/singeli/src/bins.singeli
index adc56ef7..8aade258 100644
--- a/src/singeli/src/bins.singeli
+++ b/src/singeli/src/bins.singeli
@@ -111,7 +111,7 @@ fn write_indices{I,T}(t:*I, w:*T, n:u64) : void = {
fn write_indices{I,T & width{I}==8}(t:*I, w:*T, n:u64) : void = {
@for (w over j to n) store{t, w, cast_i{I, j+1}}
}
-def bins_lookup{I, T, up, w:*T, wn:u64, x:*T, xn:u64, rp:*void} = {
+def bins_lookup{I, T, up, w:*T, wn:(u64), x:*T, xn:(u64), rp:(*void)} = {
# Build table
def tc = 1<<width{T}
t0:*I = talloc{I, tc}
diff --git a/src/singeli/src/bitops.singeli b/src/singeli/src/bitops.singeli
index 8eb08149..e7818571 100644
--- a/src/singeli/src/bitops.singeli
+++ b/src/singeli/src/bitops.singeli
@@ -1,26 +1,26 @@
def ones{T, n} = (cast{T,1}<<cast{T,n}) - 1
-def b_get{x:*u64, n:(ux)} = {
+def b_get{x:(*u64), n:(ux)} = {
((load{x,n>>6}>>(n&63)) & 1) != 0
}
-def b_getBatchLo{sz, x:*u64, n:(ux) & sz==2} = (load{*u8~~x, n>>2} >> cast_i{u8, (n&3)*2})
-def b_getBatchLo{sz, x:*u64, n:(ux) & sz==4} = (load{*u8~~x, n>>1} >> cast_i{u8, (n&1)*4})
-def b_getBatchLo{sz, x:*u64, n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
+def b_getBatchLo{sz, x:(*u64), n:(ux) & sz==2} = (load{*u8~~x, n>>2} >> cast_i{u8, (n&3)*2})
+def b_getBatchLo{sz, x:(*u64), n:(ux) & sz==4} = (load{*u8~~x, n>>1} >> cast_i{u8, (n&1)*4})
+def b_getBatchLo{sz, x:(*u64), n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
-def b_getBatch{sz, x:*u64, n:(ux) & sz==2} = b_getBatchLo{sz, x, n} & 3
-def b_getBatch{sz, x:*u64, n:(ux) & sz==4} = b_getBatchLo{sz, x, n} & 15
-def b_getBatch{sz, x:*u64, n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
+def b_getBatch{sz, x:(*u64), n:(ux) & sz==2} = b_getBatchLo{sz, x, n} & 3
+def b_getBatch{sz, x:(*u64), n:(ux) & sz==4} = b_getBatchLo{sz, x, n} & 15
+def b_getBatch{sz, x:(*u64), n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
-def b_set{x:*u64, n:(ux), v:u1} = {
+def b_set{x:(*u64), n:(ux), v:(u1)} = {
m:u64 = cast{u64,1}<<(n&63)
p:u64 = load{x,n>>6}
if (v) store{x,n>>6,p | m}
else store{x,n>>6,p & ~m}
}
-def b_setBatch{sz, x:*u64, n:(ux), v} = {
+def b_setBatch{sz, x:(*u64), n:(ux), v} = {
vc:u64 = promote{u64,v}
am:u64 = 64/sz
w:u64 = load{x,n/am}
@@ -30,7 +30,7 @@ def b_setBatch{sz, x:*u64, n:(ux), v} = {
store{x, n/am, w}
}
-def b_setBatch{sz, x:*u64, n:(ux), v & sz==4} = {
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz==4} = {
x8:= *u8 ~~ x
#w:u64 = cast_i{u64, load{x8,n/2}}
@@ -49,12 +49,12 @@ def b_setBatch{sz, x:*u64, n:(ux), v & sz==4} = {
store{x8, n/2, cast_i{u8,w}}
}
-def b_setBatch{sz, x:*u64, n:(ux), v & sz== 8} = store{*u8 ~~ x, n, cast_i{u8, v}}
-def b_setBatch{sz, x:*u64, n:(ux), v & sz==16} = store{*u16 ~~ x, n, cast_i{u16,v}}
-def b_setBatch{sz, x:*u64, n:(ux), v & sz==32} = store{*u32 ~~ x, n, cast_i{u32,v}}
-def b_setBatch{sz, x:*u64, n:(ux), v & sz==64} = store{ x, n, cast_i{u64,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz== 8} = store{*u8 ~~ x, n, cast_i{u8, v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz==16} = store{*u16 ~~ x, n, cast_i{u16,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz==32} = store{*u32 ~~ x, n, cast_i{u32,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v & sz==64} = store{ x, n, cast_i{u64,v}}
-def spreadBits{T==[32]u8, a:u32} = {
+def spreadBits{T==[32]u8, a:(u32)} = {
def idxs = iota{32}
b:= [8]u32**a
c:= [32]u8~~b
@@ -63,11 +63,11 @@ def spreadBits{T==[32]u8, a:u32} = {
e == (d&e)
}
-def spreadBits{T==[16]u8, a:u16 & hasarch{'AARCH64'}} = {
+def spreadBits{T==[16]u8, a:(u16) & hasarch{'AARCH64'}} = {
b:= sel{[16]u8, [16]u8~~[8]u16**a, make{[16]i8, iota{16}>=8}}
andnz{b, make{[16]u8, 1<<(iota{16}&7)}}
}
-def spreadBits{T==[16]u8, a:u16 & hasarch{'X86_64'}} = {
+def spreadBits{T==[16]u8, a:(u16) & hasarch{'X86_64'}} = {
b:= [16]u8~~[8]u16**a
exp:= [16]u8~~shuf{[4]i32, shuf16Lo{mzipLo{b, b}, 4b1100}, 4b1100}
(exp & make{[16]u8, 1<<(iota{16}&7)}) != [16]u8**0
@@ -78,22 +78,22 @@ def spreadBits{T, a & vcount{T} <= elwidth{T} & quality{eltype{T}}=='u'} = {
b == (b & T ~~ re_el{type{a}, T}**a) # not just T**a so that if a is read from RAM, it can use the single instruction for broadcasting from RAM; the extra bits don't matter
}
-def loadBatchBit{T, x:*u64, n:(ux)} = { # vector with type T with each element being either all 0s or 1s
+def loadBatchBit{T, x:(*u64), n:(ux)} = { # vector with type T with each element being either all 0s or 1s
spreadBits{T, b_getBatchLo{vcount{T}, x, n}}
}
# load bits starting at bit i, leaving garbage at the top. Only the bottom 57 bits are guaranteed correct; 58 and 60 will be correct if `i` is a multiple of it
-def loaduBitRaw{x:*u64, i} = {
+def loaduBitRaw{x:(*u64), i} = {
loadu{*u64~~((*u8~~x) + (i>>3))} >> (i&7)
}
-def loaduBit{x:*u64, i, n} = {
+def loaduBit{x:(*u64), i, n} = {
assert{(n<58) | (((n==58) | (n==60)) & (i%n == 0))}
loaduBitRaw{x, i}
}
-def loaduBitTrunc{x:*u64, i, n & knum{n}} = truncBits{n, loaduBit{x, i, n}}
+def loaduBitTrunc{x:(*u64), i, n & knum{n}} = truncBits{n, loaduBit{x, i, n}}
-def loadBatchBit{T, x:*u64, is & ktup{is}} = {
+def loadBatchBit{T, x:(*u64), is & ktup{is}} = {
# def len = tuplen{is}
# def count = vcount{T}
# assert{count*len <= 64}
diff --git a/src/singeli/src/bmi2.singeli b/src/singeli/src/bmi2.singeli
index cf13020e..640ca3bd 100644
--- a/src/singeli/src/bmi2.singeli
+++ b/src/singeli/src/bmi2.singeli
@@ -1,4 +1,4 @@
-def pdep{x:u64, m:u64} = emit{u64, '_pdep_u64', x, m}
-def pdep{x:u32, m:u32} = emit{u32, '_pdep_u32', x, m}
-def pext{x:u64, m:u64} = emit{u64, '_pext_u64', x, m}
-def pext{x:u32, m:u32} = emit{u32, '_pext_u32', x, m}
+def pdep{x:(u64), m:(u64)} = emit{u64, '_pdep_u64', x, m}
+def pdep{x:(u32), m:(u32)} = emit{u32, '_pdep_u32', x, m}
+def pext{x:(u64), m:(u64)} = emit{u64, '_pext_u64', x, m}
+def pext{x:(u32), m:(u32)} = emit{u32, '_pext_u32', x, m}
diff --git a/src/singeli/src/cmp.singeli b/src/singeli/src/cmp.singeli
index 23d1aeca..e47db6ca 100644
--- a/src/singeli/src/cmp.singeli
+++ b/src/singeli/src/cmp.singeli
@@ -4,11 +4,11 @@ include './f64'
include './bitops'
-def fillbits{dst:*u64, len:(ux), v } = { emit{void, 'fillBits', dst, len, v }; return{}; }
-def fillbits{dst:*u64, len:(ux), v, x} = { emit{void, 'fillBitsDec', dst, len, v, x}; return{}; }
+def fillbits{dst:(*u64), len:(ux), v } = { emit{void, 'fillBits', dst, len, v }; return{}; }
+def fillbits{dst:(*u64), len:(ux), v, x} = { emit{void, 'fillBitsDec', dst, len, v, x}; return{}; }
def cmp_err{x} = { emit{void, 'cmp_err'}; return{}; }
-fn cmpIX(dst:*u64, len:ux, x:u64, v:u1) : void = {
+fn cmpIX(dst:(*u64), len:ux, x:(u64), v:(u1)) : void = {
nan:u1 = q_f64{x}
if (~(nan | q_chr{x})) cmp_err{x}
fillbits{dst, len, v&~nan, x}
@@ -60,7 +60,7 @@ def pathAS{dst, len, T, op, x & isunsigned{T}} = {
-def any2bit{VT, unr, op0, wS, wV, xS, xV, dst:*u64, len:(ux)} = {
+def any2bit{VT, unr, op0, wS, wV, xS, xV, dst:(*u64), len:(ux)} = {
def bulk = vcount{VT}*unr
xi:ux = 0
diff --git a/src/singeli/src/copy.singeli b/src/singeli/src/copy.singeli
index cf70423d..450cc1aa 100644
--- a/src/singeli/src/copy.singeli
+++ b/src/singeli/src/copy.singeli
@@ -3,7 +3,7 @@ include './mask'
include './cbqnDefs'
include './bitops'
-def copyFromBits{T, loadFn, rp, l:u64} = {
+def copyFromBits{T, loadFn, rp, l:(u64)} = {
def bulk = vcount{T}
def TU = ty_u{T}
diff --git a/src/singeli/src/debug.singeli b/src/singeli/src/debug.singeli
index 4614fb4a..12578928 100644
--- a/src/singeli/src/debug.singeli
+++ b/src/singeli/src/debug.singeli
@@ -1,4 +1,4 @@
include 'debug/printf' # printf & lprintf
-def assert{x:u1} = { if (not x) emit{void, '__builtin_trap'} }
+def assert{x:(u1)} = { if (not x) emit{void, '__builtin_trap'} }
def test_assert = assert # test_assert is guaranteed to either not exist, or always trap on bad input
diff --git a/src/singeli/src/f64.singeli b/src/singeli/src/f64.singeli
index 646e54c4..00dda936 100644
--- a/src/singeli/src/f64.singeli
+++ b/src/singeli/src/f64.singeli
@@ -1,15 +1,15 @@
-def ceil{x:f64} = emit{f64, 'ceil', x}
-def floor{x:f64} = emit{f64, 'floor', x}
-def abs{x:f64} = emit{f64, 'fabs', x}
+def ceil{x:(f64)} = emit{f64, 'ceil', x}
+def floor{x:(f64)} = emit{f64, 'floor', x}
+def abs{x:(f64)} = emit{f64, 'fabs', x}
def NaN = 0.0/0.0
-def isNaN{x:f64} = x!=x
-def qNaN{x:u64} = (x<<1) == (cast{u64, 0x8ff8} << 49)
+def isNaN{x:(f64)} = x!=x
+def qNaN{x:(u64)} = (x<<1) == (cast{u64, 0x8ff8} << 49)
-def ftrunc{T, x:f64 & i8==T} = emit{i8, '', x}
-def ftrunc{T, x:f64 & i16==T} = emit{i16, '', x}
-def ftrunc{T, x:f64 & i32==T} = emit{i32, '', x} # maybe explicitly use _mm_cvtsd_si32?
-def ftrunc{T, x:f64 & i64==T} = emit{i64, '', x}
+def ftrunc{T, x:(f64) & i8==T} = emit{i8, '', x}
+def ftrunc{T, x:(f64) & i16==T} = emit{i16, '', x}
+def ftrunc{T, x:(f64) & i32==T} = emit{i32, '', x} # maybe explicitly use _mm_cvtsd_si32?
+def ftrunc{T, x:(f64) & i64==T} = emit{i64, '', x}
def fext{x} = emit{f64, '', x}
-def interp_f64{x:u64} = emit{f64, 'interp_f64', x}
\ No newline at end of file
+def interp_f64{x:(u64)} = emit{f64, 'interp_f64', x}
diff --git a/src/singeli/src/fold.singeli b/src/singeli/src/fold.singeli
index f513360d..bb132d4e 100644
--- a/src/singeli/src/fold.singeli
+++ b/src/singeli/src/fold.singeli
@@ -1,10 +1,10 @@
include './base'
include './mask'
-def opsh64{op}{v:[4]f64, perm} = op{v, shuf{[4]u64, v, perm}}
-def opsh32{op}{v:[2]f64, perm} = op{v, shuf{[4]u32, v, perm}}
-def mix{op, v:[4]f64 & hasarch{'AVX'}} = { def sh=opsh64{op}; sh{sh{v, 4b2301}, 4b1032} }
-def mix{op, v:[2]f64 & hasarch{'X86_64'}} = opsh32{op}{v, 4b1032}
+def opsh64{op}{v:([4]f64), perm} = op{v, shuf{[4]u64, v, perm}}
+def opsh32{op}{v:([2]f64), perm} = op{v, shuf{[4]u32, v, perm}}
+def mix{op, v:([4]f64) & hasarch{'AVX'}} = { def sh=opsh64{op}; sh{sh{v, 4b2301}, 4b1032} }
+def mix{op, v:([2]f64) & hasarch{'X86_64'}} = opsh32{op}{v, 4b1032}
def reduce_pairwise{op, plog, x:*T, len, init:T} = {
# Pairwise combination to shorten dependency chains
diff --git a/src/singeli/src/hashtab.singeli b/src/singeli/src/hashtab.singeli
index 411a8d5d..a14cc5dd 100644
--- a/src/singeli/src/hashtab.singeli
+++ b/src/singeli/src/hashtab.singeli
@@ -15,13 +15,13 @@ def memset{p:pT, v, l} = {
# These hashes are stored in tables and must be invertible!
# Murmur3
-def hash_val{x0:u32} = {
+def hash_val{x0:(u32)} = {
x := x0
x ^= x >> 16; x *= 0x85ebca6b
x ^= x >> 13; x *= 0xc2b2ae35
x ^= x >> 16; x
}
-def hash_val{x0:u64} = {
+def hash_val{x0:(u64)} = {
x := x0
x ^= x >> 33; x *= 0xff51afd7ed558ccd
x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53
@@ -29,7 +29,7 @@ def hash_val{x0:u64} = {
}
# CRC32
if (hasarch{'SSE4.2'}) require{'x86intrin.h'}
-def hash_val{x:u32 & hasarch{'SSE4.2'}} = {
+def hash_val{x:(u32) & hasarch{'SSE4.2'}} = {
emit{u32, '_mm_crc32_u32', 0x973afb51, x}
}
diff --git a/src/singeli/src/mask.singeli b/src/singeli/src/mask.singeli
index 60232f3a..598bb439 100644
--- a/src/singeli/src/mask.singeli
+++ b/src/singeli/src/mask.singeli
@@ -70,8 +70,8 @@ def storeBatch{ptr:P, ns, xs, M & istup{ns}} = each{{n,x} => storeBatch{ptr, n,
# "harmless" pointer cast that'll only cast void*
def hCast{T,p} = assert{show{'expected pointer with element',T,'or void but got ',p}}
-def hCast{T,p:P & same{T,eltype{P}}} = p
-def hCast{T,p:P & same{P,*void}} = *T~~p
+def hCast{T,p:*T} = p
+def hCast{T,p:(*void)} = *T~~p
def mlExec{i, iter, vars0, bulk, M} = {
def vproc{p:P & isptr{P}} = p
diff --git a/src/singeli/src/replicate.singeli b/src/singeli/src/replicate.singeli
index 98a9f428..45074114 100644
--- a/src/singeli/src/replicate.singeli
+++ b/src/singeli/src/replicate.singeli
@@ -20,7 +20,7 @@ def scan_core{upd, set, scan, rp:pT, wp:W, s:(usz)} = {
k = e
}
}
-def indrep_by_sum{T, rp:*T, wp, s:(usz), js, inc} = {
+def indrep_by_sum{T, rp:(*T), wp, s:(usz), js, inc} = {
def scan{ptr, len} = @for (ptr over len) js=ptr+=js
def scan{ptr, len & width{T}<=32} = {
def scanfn = merge{'si_scan_pluswrap_u',fmtnat{width{T}}}
@@ -45,7 +45,7 @@ fn ind_by_scan_i32{W}(xv:*void, rp:*i32, s:usz) : void = {
}
}
-def rep_by_scan{T, wp, xv:*void, rv:*void, s} = {
+def rep_by_scan{T, wp, xv:(*void), rv:(*void), s} = {
xp := *T~~xv; js := *xp; px := js
def inc{j} = {sx:=px; px=load{xp,j}; px-sx}
indrep_by_sum{T, *T~~rv, wp, s, js, inc}
@@ -101,7 +101,7 @@ def rcsh4_dom = replicate{bind{>=,64}, replicate{fact_tab==1, fact_inds}}
rcsh4_dat:*i8 = join{join{each{get_shuf_data{., 4}, rcsh4_dom}}}
rcsh4_lkup:*i8 = shiftright{0, scan{+, fold{|, table{==, rcsh4_dom, iota{64}}}}}
-def read_shuf_vecs{l, ellw:u64, shp:P} = { # tuple of byte selectors in 1<<ellw
+def read_shuf_vecs{l, ellw:(u64), shp:P} = { # tuple of byte selectors in 1<<ellw
def V = eltype{P}
def double{x:X & hasarch{'AVX2'}} = {
s:=shuf{[4]u64, x, 4b3120}; s+=s
@@ -126,7 +126,7 @@ def read_shuf_vecs{l, ellw:u64, shp:P} = { # tuple of byte selectors in 1<<ellw
sh
}
-def rep_const_shuffle{V, wv, onreps, xv:*V, rv:*V, n:u64} = { # onreps{inputVector, {nextOutputVector} => ...}
+def rep_const_shuffle{V, wv, onreps, xv:*V, rv:*V, n:(u64)} = { # onreps{inputVector, {nextOutputVector} => ...}
def step = vcount{V}
nv := n / step
j:u64 = 0
@@ -168,7 +168,7 @@ if (hasarch{'AVX2'}) {
{x, gen} => each{{s}=>gen{shuf{V, x, s}}, sh}
}
- def rep_const_shuffle{V, wv, xv:*V, rv:*V, n:u64} = rep_const_shuffle{V, wv, get_rep_iter{V, wv}, xv, rv, n}
+ def rep_const_shuffle{V, wv, xv:*V, rv:*V, n:(u64)} = rep_const_shuffle{V, wv, get_rep_iter{V, wv}, xv, rv, n}
} else if (hasarch{'AARCH64'}) {
@@ -176,7 +176,7 @@ if (hasarch{'AVX2'}) {
each{{s} => gen{sel{[16]u8, x, s}}, sh}
}
- def rep_const_shuffle{V, wv==2, xv0:*V, rv0:*V, n:u64} = {
+ def rep_const_shuffle{V, wv==2, xv0:*V, rv0:*V, n:(u64)} = {
def E = ty_u{eltype{V}}
rv:= *E~~rv0
@for (x in *E~~xv0 over i to n) { # autovectorized well enough, probably
@@ -240,7 +240,7 @@ fn rep_const_shuffle_any(wv:u64, ellw:u64, x:*i8, r:*i8, n:u64) : void = {
each{try, rcsh_vals}
}
-def rep_const_broadcast{T, kv, loop, wv:u64, x:*T, r:*T, n:u64} = {
+def rep_const_broadcast{T, kv, loop, wv:(u64), x:*T, r:*T, n:(u64)} = {
assert{kv > 0}
def V = [arch_defvw/width{T}]T
@for (x over n) {
diff --git a/src/singeli/src/scan.singeli b/src/singeli/src/scan.singeli
index fed3f288..dcf50ec4 100644
--- a/src/singeli/src/scan.singeli
+++ b/src/singeli/src/scan.singeli
@@ -12,7 +12,7 @@ fn scan_scal{T, op}(x:*T, r:*T, len:u64, m:T) : void = {
@for (x, r over len) r = m = op{m, x}
}
-def scan_loop{T, init, x:*T, r:*T, len:u64, scan, scan_last} = {
+def scan_loop{T, init, x:*T, r:*T, len:(u64), scan, scan_last} = {
def step = arch_defvw/width{T}
def V = [step]T
p:= V**init
@@ -23,7 +23,7 @@ def scan_loop{T, init, x:*T, r:*T, len:u64, scan, scan_last} = {
q:= len & (step-1)
if (q!=0) homMaskStoreF{rv+e, maskOf{V, q}, scan_last{load{xv,e}, p}}
}
-def scan_post{T, init, x:*T, r:*T, len:u64, op, pre} = {
+def scan_post{T, init, x:*T, r:*T, len:(u64), op, pre} = {
def last{v, p} = op{pre{v}, p}
def scan{v, p} = {
n:= last{v, p}
@@ -149,7 +149,7 @@ fn bcs{T & hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = {
def sums{n} = (if (n==0) tup{0}; else { def s=sums{n-1}; merge{s,s+1} })
def widen{v:T} = unpackQ{shuf{[4]u64, v, 4b3120}, T**0}
- def sumlanes{x:u32} = {
+ def sumlanes{x:(u32)} = {
b:= [8]u32**x >> make{[8]u32, 4*tail{1, iota{8}}}
s:= sel8{[32]u8~~b, ii32>>3 + bit{2}}
p:= s & make{[32]u8, (1<<(1+tail{2})) - 1} # Prefixes
@@ -157,12 +157,12 @@ fn bcs{T & hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = {
d+= sel8{d, bit{2}*(1+bit{3}>>2)-1}
d + sel8{d, bit{3}-1}
}
- def step{x:u32, i, store1} = {
+ def step{x:(u32), i, store1} = {
d:= sumlanes{x}
if (w==8) d+= [32]u8~~shuf{[4]u64, [8]i32~~sel8{d, bit{3}<<4-1}, 4b1100}
j:= (w/8)*i
def out{v, k} = each{out, widen{v}, 2*k+iota{2}}
- def out{v0:[vl]T, k} = {
+ def out{v0:([vl]T), k} = {
v := V~~v0 + c
# Update carry at the lane boundary
if (w!=32 or tail{1,k}) {
@@ -239,7 +239,7 @@ fn plus_scan{X, R, O}(x:*X, c:R, r:*R, len:u64) : O = {
len
}
# Sum as many vector registers as possible; modifies c and i
-def simd_plus_scan_part{X, R}{x:*X, c:(R), r:*R, len:u64, i:u64} = {
+def simd_plus_scan_part{X, R}{x:(*X), c:(R), r:(*R), len:(u64), i:(u64)} = {
def b = max{width{R}/2, width{X}}
def bulk = arch_defvw/b
diff --git a/src/singeli/src/search.singeli b/src/singeli/src/search.singeli
index 0e4d0287..bb7c22df 100644
--- a/src/singeli/src/search.singeli
+++ b/src/singeli/src/search.singeli
@@ -18,7 +18,7 @@ def findFirst{C, M, F, ...v1} = {
F{...args}
}
-def search{E, x, n:u64, OP} = {
+def search{E, x, n:(u64), OP} = {
def bulk = arch_defvw/width{E}
def VT = [bulk]E
def end = makeBranch{
@@ -111,7 +111,7 @@ def readbytes{vtab}{} = {
}
# Look up bits from table
-def bittab_lookup{x0:*void, n:u64, r0:*void, tab:*void} = {
+def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void)} = {
x:= *u8~~x0
t:= *TI~~tab
r:= *u64~~r0
@@ -124,7 +124,7 @@ def bittab_lookup{x0:*void, n:u64, r0:*void, tab:*void} = {
x+=k; rem-=k; ++r
}
}
-def bittab_lookup{x0:*void, n:u64, r0:*void, tab:*void & simd_bittab} = {
+def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void) & simd_bittab} = {
def {bitsel, _} = bittab_selector{readbytes{*VI~~tab}}
def k = vcount{VI}
@for (x in *VI~~x0, r in *ty_u{k}~~r0 over cdiv{n,k}) r = bitsel{x}
@@ -139,7 +139,7 @@ def bittab_lookup{x0:*void, n:u64, r0:*void, tab:*void & simd_bittab} = {
# - 'mask': Mark Firsts of x0
# - 'unique': Deduplicate of x0
# - 'index': First index of value x at r0+x
-def do_bittab{x0:*void, n:u64, tab:*void, u:u8, t, mode, r0} = {
+def do_bittab{x0:(*void), n:(u64), tab:(*void), u:(u8), t, mode, r0} = {
def rbit = mode == 'mask'
def rval = mode == 'unique'
def rind = mode == 'index'
diff --git a/src/singeli/src/select.singeli b/src/singeli/src/select.singeli
index 87724f94..a290baa5 100644
--- a/src/singeli/src/select.singeli
+++ b/src/singeli/src/select.singeli
@@ -9,11 +9,11 @@ include 'util/tup'
# def:T - masked original content
# b:B - pointer to data to index; if width{B}<elwidth{T}, padding bytes are garbage read after wanted position
# idx - actual (unscaled) index list
-def gather{d:T, b:B, idx:[8]i32, M & w256{T,32}} = {
+def gather{d:T, b:B, idx:([8]i32), M & w256{T,32}} = {
if (M{0}) T ~~ emit{[8]i32, '_mm256_mask_i32gather_epi32', d, *void~~b, idx, M{T,'to sign bits'}, elwidth{B}/8}
else T ~~ emit{[8]i32, '_mm256_i32gather_epi32', *void~~b, idx, elwidth{B}/8}
}
-def gather{d:T, b:B, idx:[4]i32, M & w256{T,64}} = {
+def gather{d:T, b:B, idx:([4]i32), M & w256{T,64}} = {
if (M{0}) T ~~ emit{[4]i64, '_mm256_mask_i32gather_epi64', d, *void~~b, idx, M{T,'to sign bits'}, elwidth{B}/8}
else T ~~ emit{[4]i64, '_mm256_i32gather_epi64', *void~~b, idx, elwidth{B}/8}
}
@@ -164,4 +164,4 @@ fn avx2_select_bool128(w0:*void, x0:*void, r0:*void, wl:u64, xl:u64) : u1 = {
}
export{'avx2_select_bool128', avx2_select_bool128}
-}
\ No newline at end of file
+}
diff --git a/src/singeli/src/sse2.singeli b/src/singeli/src/sse2.singeli
index 2bf09a84..8e82ad46 100644
--- a/src/singeli/src/sse2.singeli
+++ b/src/singeli/src/sse2.singeli
@@ -40,8 +40,8 @@ def __mul{a:T,b:T & [4]i32==T} = {
}
# float arith
-def rsqrtE{a:[4]f32} = emit{[4]f32, '_mm_rsqrt_ps', a}
-def rcpE{a:[4]f32} = emit{[4]f32, '_mm_rcp_ps', a}
+def rsqrtE{a:([4]f32)} = emit{[4]f32, '_mm_rsqrt_ps', a}
+def rcpE{a:([4]f32)} = emit{[4]f32, '_mm_rcp_ps', a}
# mask stuff
def andAllZero{x:T, y:T & w128i{T}} = homAll{(x & y) == T**0}
@@ -94,15 +94,15 @@ def widen{T==[2]f64, x:X & w128s{X} & elwidth{X}<32} = widen{T, widen{[4]i32, x}
def widen{T==[2]f64, x:X & X==[4]i32} = emit{T, '_mm_cvtepi32_pd', x}
def widen{T==[2]f64, x:X & X==[4]f32} = emit{T, '_mm_cvtps_pd', x}
-def narrow{T==i16, x:[4]i32} = packs{x,x}
-def narrow{T==i8, x:[8]i16} = packs{x,x}
-def narrow{T==u8, x:[8]u16} = packs{x,x}
-def narrow{T==u16, x:[4]u32} = [8]u16~~shuf{[4]i32, shuf16Hi{shuf16Lo{x, 4b3320}, 4b3320}, 4b3320}
-def narrow{T==i8, x:[4]i32} = narrow{T, narrow{i16, x}}
-def narrow{T==u8, x:[4]u32} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{x}}}
-def narrow{T==u8, x:[2]u64} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{f{x}}}}
-def narrow{T==u16, x:[2]u64} = shuf16Lo{[8]u16~~shuf{[4]i32, x, 4b3320}, 4b3320}
-def narrow{T==u32, x:[2]u64} = [4]u32~~shuf{[4]i32, x, 4b3320}
+def narrow{T==i16, x:([4]i32)} = packs{x,x}
+def narrow{T==i8, x:([8]i16)} = packs{x,x}
+def narrow{T==u8, x:([8]u16)} = packs{x,x}
+def narrow{T==u16, x:([4]u32)} = [8]u16~~shuf{[4]i32, shuf16Hi{shuf16Lo{x, 4b3320}, 4b3320}, 4b3320}
+def narrow{T==i8, x:([4]i32)} = narrow{T, narrow{i16, x}}
+def narrow{T==u8, x:([4]u32)} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{x}}}
+def narrow{T==u8, x:([2]u64)} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{f{x}}}}
+def narrow{T==u16, x:([2]u64)} = shuf16Lo{[8]u16~~shuf{[4]i32, x, 4b3320}, 4b3320}
+def narrow{T==u32, x:([2]u64)} = [4]u32~~shuf{[4]i32, x, 4b3320}
def narrow{T, x:X & w128f{X,64} & T<i32} = narrow{T, narrow{i32, x}}
-def narrow{T==i32, x:[2]f64} = emit{[4]i32, '_mm_cvtpd_epi32', x}
+def narrow{T==i32, x:([2]f64)} = emit{[4]i32, '_mm_cvtpd_epi32', x}