summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarshall Lochbaum <mwlochbaum@gmail.com>2024-05-10 10:26:28 -0400
committerMarshall Lochbaum <mwlochbaum@gmail.com>2024-05-10 10:26:28 -0400
commitb2e3a5ff74d3025614ca0b12073b1a2b8ec90313 (patch)
tree0d7489051f271ea420b9d92acfaf7bbaf3cff1b4
parent4d6612cb16ceeb1a87caba83b2bafb4550526512 (diff)
Move from & to if/and for Singeli conditions
-rw-r--r--src/singeli/src/avx.singeli56
-rw-r--r--src/singeli/src/avx2.singeli122
-rw-r--r--src/singeli/src/avx512.singeli16
-rw-r--r--src/singeli/src/base.singeli172
-rw-r--r--src/singeli/src/bins.singeli14
-rw-r--r--src/singeli/src/bitops.singeli32
-rw-r--r--src/singeli/src/cbqnDefs.singeli30
-rw-r--r--src/singeli/src/clmul.singeli2
-rw-r--r--src/singeli/src/cmp.singeli24
-rw-r--r--src/singeli/src/count.singeli2
-rw-r--r--src/singeli/src/dyarith.singeli38
-rw-r--r--src/singeli/src/equal.singeli6
-rw-r--r--src/singeli/src/f64.singeli8
-rw-r--r--src/singeli/src/fold.singeli4
-rw-r--r--src/singeli/src/hashtab.singeli2
-rw-r--r--src/singeli/src/mask.singeli36
-rw-r--r--src/singeli/src/neon.singeli176
-rw-r--r--src/singeli/src/replicate.singeli10
-rw-r--r--src/singeli/src/scan.singeli22
-rw-r--r--src/singeli/src/scan_common.singeli20
-rw-r--r--src/singeli/src/search.singeli12
-rw-r--r--src/singeli/src/select.singeli6
-rw-r--r--src/singeli/src/slash.singeli60
-rw-r--r--src/singeli/src/squeeze.singeli18
-rw-r--r--src/singeli/src/sse.singeli38
-rw-r--r--src/singeli/src/sse2.singeli104
-rw-r--r--src/singeli/src/transpose.singeli12
-rw-r--r--src/singeli/src/vecfold.singeli4
28 files changed, 523 insertions, 523 deletions
diff --git a/src/singeli/src/avx.singeli b/src/singeli/src/avx.singeli
index b4b6894f..d748be9d 100644
--- a/src/singeli/src/avx.singeli
+++ b/src/singeli/src/avx.singeli
@@ -1,54 +1,54 @@
# compact casting for the annoying intrinsic type system
-def v2i{x:T & w256{T}} = if(isint{eltype{T}}) x else [32]u8 ~~ x
-def v2f{x:T & w256{T}} = [8]f32 ~~ x
-def v2d{x:T & w256{T}} = [4]f64 ~~ x
+def v2i{x:T if w256{T}} = if(isint{eltype{T}}) x else [32]u8 ~~ x
+def v2f{x:T if w256{T}} = [8]f32 ~~ x
+def v2d{x:T if w256{T}} = [4]f64 ~~ x
-def undefPromote{T, x:X & w128{X} & w256{T} & eltype{T}==eltype{X}} = T~~emit{[32]u8, '_mm256_castsi128_si256', v2i{x}}
+def undefPromote{T, x:X if w128{X} and w256{T} and eltype{T}==eltype{X}} = T~~emit{[32]u8, '_mm256_castsi128_si256', v2i{x}}
# load & store
-def loadLow{ptr:P, w & w256{eltype{P}} & w<=128} = undefPromote{eltype{P}, loadLow{*n_h{eltype{P}} ~~ ptr, w}}
-def loadLow{ptr:P, w & w256{eltype{P}} & w==256} = load{ptr}
+def loadLow{ptr:P, w if w256{eltype{P}} and w<=128} = undefPromote{eltype{P}, loadLow{*n_h{eltype{P}} ~~ ptr, w}}
+def loadLow{ptr:P, w if w256{eltype{P}} and w==256} = load{ptr}
-def storeLow{ptr:P, w, x:T & w256{T} & w<=128} = storeLow{ptr, w, half{x, 0}}
-def storeLow{ptr:P, w, x:T & w256{T} & w==256} = store{*T~~ptr, 0, x}
+def storeLow{ptr:P, w, x:T if w256{T} and w<=128} = storeLow{ptr, w, half{x, 0}}
+def storeLow{ptr:P, w, x:T if w256{T} and w==256} = store{*T~~ptr, 0, x}
# float comparison
local def f32cmpAVX{a,b,n} = [8]u32 ~~ emit{[8]f32, '_mm256_cmp_ps', a, b, n}
local def f64cmpAVX{a,b,n} = [4]u64 ~~ emit{[4]f64, '_mm256_cmp_pd', a, b, n}
-def unord{a:T,b:T & T==[8]f32} = f32cmpAVX{a,b,3}
-def unord{a:T,b:T & T==[4]f64} = f64cmpAVX{a,b,3}
+def unord{a:T,b:T if T==[8]f32} = f32cmpAVX{a,b,3}
+def unord{a:T,b:T if T==[4]f64} = f64cmpAVX{a,b,3}
# f32 arith
def rsqrtE{a:([8]f32)} = emit{[8]f32, '_mm256_rsqrt_ps', a}
def rcpE{a:([8]f32)} = emit{[8]f32, '_mm256_rcp_ps', a}
# conversion
-def half{x:T, i & w256{T} & knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i}
-def half{x:T, i==0 & w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}}
-def pair{a:T,b:T & width{T}==128} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b}
+def half{x:T, i if w256{T} and knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i}
+def half{x:T, i==0 if w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}}
+def pair{a:T,b:T if width{T}==128} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b}
-def widen{T==[4]f64, x:X & X==[4]i32} = emit{T, '_mm256_cvtepi32_pd', x}
-def widen{T==[4]f64, x:X & X==[4]f32} = emit{T, '_mm256_cvtps_pd', x}
-def widen{T==[4]f64, x:X & w128i{X} & elwidth{X}<32} = widen{T, widen{[4]i32, x}}
-def widen{T, x:X & w256{X} & vcount{X}>vcount{T}} = widen{T, half{x,0}}
+def widen{T==[4]f64, x:X if X==[4]i32} = emit{T, '_mm256_cvtepi32_pd', x}
+def widen{T==[4]f64, x:X if X==[4]f32} = emit{T, '_mm256_cvtps_pd', x}
+def widen{T==[4]f64, x:X if w128i{X} and elwidth{X}<32} = widen{T, widen{[4]i32, x}}
+def widen{T, x:X if w256{X} and vcount{X}>vcount{T}} = widen{T, half{x,0}}
# structural operations
-def topBlend{f:T, t:T, m:M & w256{T,32} & w256i{M,32}} = T ~~ emit{[8]f32, '_mm256_blendv_ps', v2f{f}, v2f{t}, v2f{m}}
-def topBlend{f:T, t:T, m:M & w256{T,64} & w256i{M,64}} = T ~~ emit{[4]f64, '_mm256_blendv_pd', v2d{f}, v2d{t}, v2d{m}}
-def homBlend{f:T, t:T, m:M & w256{T}} = topBlend{f, t, m}
+def topBlend{f:T, t:T, m:M if w256{T,32} and w256i{M,32}} = T ~~ emit{[8]f32, '_mm256_blendv_ps', v2f{f}, v2f{t}, v2f{m}}
+def topBlend{f:T, t:T, m:M if w256{T,64} and w256i{M,64}} = T ~~ emit{[4]f64, '_mm256_blendv_pd', v2d{f}, v2d{t}, v2d{m}}
+def homBlend{f:T, t:T, m:M if w256{T}} = topBlend{f, t, m}
# mask stuff
-def andAllZero{x:T, y:T & w256i{T}} = emit{u1, '_mm256_testz_si256', x, y}
+def andAllZero{x:T, y:T if w256i{T}} = emit{u1, '_mm256_testz_si256', x, y}
-def topMask{x:T & w256{T, 32}} = emit{u8, '_mm256_movemask_ps', v2f{x}}
-def topMask{x:T & w256{T, 64}} = emit{u8, '_mm256_movemask_pd', v2d{x}}
-def homMask{x:T & w256{T}} = topMask{x}
+def topMask{x:T if w256{T, 32}} = emit{u8, '_mm256_movemask_ps', v2f{x}}
+def topMask{x:T if w256{T, 64}} = emit{u8, '_mm256_movemask_pd', v2d{x}}
+def homMask{x:T if w256{T}} = topMask{x}
-def homAny{x:T & w256i{T} & elwidth{T}>=32} = homMask{[8]u32 ~~ x} != 0
-def homAll{x:T & w256i{T} & elwidth{T}>=32} = homMask{[8]u32 ~~ x} == 0xff
+def homAny{x:T if w256i{T} and elwidth{T}>=32} = homMask{[8]u32 ~~ x} != 0
+def homAll{x:T if w256i{T} and elwidth{T}>=32} = homMask{[8]u32 ~~ x} == 0xff
-def topAny{x:T & w256i{T} & elwidth{T}>=32} = topMask{x} != 0
-def topAll{x:T & w256i{T} & elwidth{T}>=32} = topMask{x} == (1<<vcount{T})-1
+def topAny{x:T if w256i{T} and elwidth{T}>=32} = topMask{x} != 0
+def topAll{x:T if w256i{T} and elwidth{T}>=32} = topMask{x} == (1<<vcount{T})-1
diff --git a/src/singeli/src/avx2.singeli b/src/singeli/src/avx2.singeli
index d50e1543..a5ed5721 100644
--- a/src/singeli/src/avx2.singeli
+++ b/src/singeli/src/avx2.singeli
@@ -1,97 +1,97 @@
# questionable pack
-def unpackQ{a:T,b:T & T==[32]i8 } = { tup{emit{[16]i16, '_mm256_unpacklo_epi8', a, b}, emit{[16]i16, '_mm256_unpackhi_epi8', a, b}}}
-def unpackQ{a:T,b:T & T==[16]i16} = { tup{emit{[ 8]i32, '_mm256_unpacklo_epi16', a, b}, emit{[ 8]i32, '_mm256_unpackhi_epi16', a, b}}}
-def unpackQ{a:T,b:T & T==[ 8]i32} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi32', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi32', a, b}}}
-def unpackQ{a:T,b:T & T==[ 4]i64} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi64', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi64', a, b}}}
+def unpackQ{a:T,b:T if T==[32]i8 } = { tup{emit{[16]i16, '_mm256_unpacklo_epi8', a, b}, emit{[16]i16, '_mm256_unpackhi_epi8', a, b}}}
+def unpackQ{a:T,b:T if T==[16]i16} = { tup{emit{[ 8]i32, '_mm256_unpacklo_epi16', a, b}, emit{[ 8]i32, '_mm256_unpackhi_epi16', a, b}}}
+def unpackQ{a:T,b:T if T==[ 8]i32} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi32', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi32', a, b}}}
+def unpackQ{a:T,b:T if T==[ 4]i64} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi64', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi64', a, b}}}
# inverse of questionable pack; these saturate the argument
-def packQ{a:T,b:T & T==[16]i16} = emit{[32]i8, '_mm256_packs_epi16', a, b}
-def packQ{a:T,b:T & T==[ 8]i32} = emit{[16]i16, '_mm256_packs_epi32', a, b}
-def packQ{a:T,b:T & T==[16]u16} = emit{[32]u8, '_mm256_packus_epi16', a, b}
-def packQ{a:T,b:T & T==[ 8]u32} = emit{[16]u16, '_mm256_packus_epi32', a, b}
+def packQ{a:T,b:T if T==[16]i16} = emit{[32]i8, '_mm256_packs_epi16', a, b}
+def packQ{a:T,b:T if T==[ 8]i32} = emit{[16]i16, '_mm256_packs_epi32', a, b}
+def packQ{a:T,b:T if T==[16]u16} = emit{[32]u8, '_mm256_packus_epi16', a, b}
+def packQ{a:T,b:T if T==[ 8]u32} = emit{[16]u16, '_mm256_packus_epi32', a, b}
# super questionable pack - assumes high halves are zero
-def packQQ{a:T,b:T & T==[4]i64} = emit{[8]i32, '_mm256_shuffle_epi32', a, 4b1120} | emit{[8]i32, '_mm256_shuffle_epi32', b, 4b2011}
+def packQQ{a:T,b:T if T==[4]i64} = emit{[8]i32, '_mm256_shuffle_epi32', a, 4b1120} | emit{[8]i32, '_mm256_shuffle_epi32', b, 4b2011}
def packQQ{{a, b}} = packQQ{a, b}
# arith
-def mulh {a:T,b:T & [16]i16==T} = emit{T, '_mm256_mulhi_epi16', a, b}
-def mulh {a:T,b:T & [16]u16==T} = emit{T, '_mm256_mulhi_epu16', a, b}
-def mul32{a:T,b:T & [ 4]i64==T} = emit{T, '_mm256_mul_epi32', a, b} # reads only low 32 bits of arguments
-def mul32{a:T,b:T & [ 4]u64==T} = emit{T, '_mm256_mul_epu32', a, b} # reads only low 32 bits of arguments
+def mulh {a:T,b:T if [16]i16==T} = emit{T, '_mm256_mulhi_epi16', a, b}
+def mulh {a:T,b:T if [16]u16==T} = emit{T, '_mm256_mulhi_epu16', a, b}
+def mul32{a:T,b:T if [ 4]i64==T} = emit{T, '_mm256_mul_epi32', a, b} # reads only low 32 bits of arguments
+def mul32{a:T,b:T if [ 4]u64==T} = emit{T, '_mm256_mul_epu32', a, b} # reads only low 32 bits of arguments
# structural operations
-def shl{S==[16]u8, x:T, n & w256{T} & knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n}
-def shr{S==[16]u8, x:T, n & w256{T} & knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n}
+def shl{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n}
+def shr{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n}
-def blend{L==[8]u16, a:T, b:T, m & w256{T} & knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m}
-def blend{L==[8]u32, a:T, b:T, m & w256{T} & knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m}
-def blend{L==[4]u64, a:T, b:T, m & w256{T} & knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m}
+def blend{L==[8]u16, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m}
+def blend{L==[8]u32, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m}
+def blend{L==[4]u64, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m}
-def topBlend{f:T, t:T, m:M & w256{T, 8} & w256i{M, 8}} = T ~~ emit{[32]i8, '_mm256_blendv_epi8', v2i{f}, v2i{t}, v2i{m}}
-def homBlend{f:T, t:T, m:M & w256{T, 8} & w256i{M, 8}} = topBlend{f, t, m}
-def homBlend{f:T, t:T, m:M & w256{T, 16} & w256i{M,16}} = T ~~ topBlend{[32]i8~~f, [32]i8~~t, [32]i8~~m}
+def topBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = T ~~ emit{[32]i8, '_mm256_blendv_epi8', v2i{f}, v2i{t}, v2i{m}}
+def homBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = topBlend{f, t, m}
+def homBlend{f:T, t:T, m:M if w256{T, 16} and w256i{M,16}} = T ~~ topBlend{[32]i8~~f, [32]i8~~t, [32]i8~~m}
-def shuf{L, x:T, n & lvec{L,4,32} & w256{T} & knum{n}} = T ~~ emit{[8]i32, '_mm256_shuffle_epi32', v2i{x}, n}
-def shuf{L, x:T, n & lvec{L,4,64} & w256{T} & knum{n}} = T ~~ emit{[4]f64, '_mm256_permute4x64_pd', v2d{x}, n}
-def shufHalves{x:T, y:T, n & w256{T} & knum{n}} = T ~~ emit{[4]i64, '_mm256_permute2x128_si256', v2i{x}, v2i{y}, n}
+def shuf{L, x:T, n if lvec{L,4,32} and w256{T} and knum{n}} = T ~~ emit{[8]i32, '_mm256_shuffle_epi32', v2i{x}, n}
+def shuf{L, x:T, n if lvec{L,4,64} and w256{T} and knum{n}} = T ~~ emit{[4]f64, '_mm256_permute4x64_pd', v2d{x}, n}
+def shufHalves{x:T, y:T, n if w256{T} and knum{n}} = T ~~ emit{[4]i64, '_mm256_permute2x128_si256', v2i{x}, v2i{y}, n}
-def sel{L, x:T, i:I & w256{T} & lvec{L,8,32} & w256{I,32}} = T ~~ emit{[32]u8, '_mm256_permutevar8x32_epi32', v2i{x}, i}
-def sel{L, x:T, i:I & w256{T} & lvec{L,16,8} & w256{I, 8}} = T ~~ emit{[32]u8, '_mm256_shuffle_epi8', v2i{x}, i}
+def sel{L, x:T, i:I if w256{T} and lvec{L,8,32} and w256{I,32}} = T ~~ emit{[32]u8, '_mm256_permutevar8x32_epi32', v2i{x}, i}
+def sel{L, x:T, i:I if w256{T} and lvec{L,16,8} and w256{I, 8}} = T ~~ emit{[32]u8, '_mm256_shuffle_epi8', v2i{x}, i}
# masked store; F variants may not be a single instruction
-def topMaskStore{p:P, m:M, v:T & w256i{M, 32} & w256{T,elwidth{M}} & eltype{P}==T} = emit{void, '_mm256_maskstore_epi32', *i32~~p, m, [8]i32~~v}
-def topMaskStore{p:P, m:M, v:T & w256i{M, 64} & w256{T,elwidth{M}} & eltype{P}==T} = emit{void, '_mm256_maskstore_pd', *f64~~p, m, [4]f64~~v}
-def homMaskStore{p:P, m:M, v:T & w256i{M} & w256{T,elwidth{M}} & eltype{P}==T} = topMaskStore{p, m, v}
+def topMaskStore{p:P, m:M, v:T if w256i{M, 32} and w256{T,elwidth{M}} and eltype{P}==T} = emit{void, '_mm256_maskstore_epi32', *i32~~p, m, [8]i32~~v}
+def topMaskStore{p:P, m:M, v:T if w256i{M, 64} and w256{T,elwidth{M}} and eltype{P}==T} = emit{void, '_mm256_maskstore_pd', *f64~~p, m, [4]f64~~v}
+def homMaskStore{p:P, m:M, v:T if w256i{M} and w256{T,elwidth{M}} and eltype{P}==T} = topMaskStore{p, m, v}
-def topMaskStoreF{p:P, m:M, v:T & w256i{M} & elwidth{T}>=32} = topMaskStore{p,m,v}
-def homMaskStoreF{p:P, m:M, v:T & w256i{M} & elwidth{T}>=32} = topMaskStore{p,m,v}
-def homMaskStoreF{p:P, m:M, v:T & w256i{M} & elwidth{T}<=16 & w256{T,elwidth{M}} & eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}}
+def topMaskStoreF{p:P, m:M, v:T if w256i{M} and elwidth{T}>=32} = topMaskStore{p,m,v}
+def homMaskStoreF{p:P, m:M, v:T if w256i{M} and elwidth{T}>=32} = topMaskStore{p,m,v}
+def homMaskStoreF{p:P, m:M, v:T if w256i{M} and elwidth{T}<=16 and w256{T,elwidth{M}} and eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}}
# mask stuff
-def topMask{x:T & w256{T, 8}} = emit{u32, '_mm256_movemask_epi8', x}
-def topMask{x:T & w256{T, 16}} = {
+def topMask{x:T if w256{T, 8}} = emit{u32, '_mm256_movemask_epi8', x}
+def topMask{x:T if w256{T, 16}} = {
msk:u32 = topMask{emit{[32]u8, '_mm256_packs_epi16', x, [16]u16**0}}
(msk&255) | (msk>>8)
}
-def homAny{x:T & w256i{T}} = ~emit{u1, '_mm256_testz_si256', v2i{x}, v2i{x}}
-def homAll{x:T & w256i{T}} = homMask{[32]u8 ~~ x} == 0xffff_ffff
-def topAny{x:T & w256i{T}} = topMask{x} != 0
-def topAll{x:T & w256i{T}} = topMask{x} == (1<<vcount{T})-1
-def homMask{a:T, b:T & w256i{T,16}} = homMask{shuf{[4]u64, packQ{ty_s{a},ty_s{b}}, 4b3120}}
+def homAny{x:T if w256i{T}} = ~emit{u1, '_mm256_testz_si256', v2i{x}, v2i{x}}
+def homAll{x:T if w256i{T}} = homMask{[32]u8 ~~ x} == 0xffff_ffff
+def topAny{x:T if w256i{T}} = topMask{x} != 0
+def topAll{x:T if w256i{T}} = topMask{x} == (1<<vcount{T})-1
+def homMask{a:T, b:T if w256i{T,16}} = homMask{shuf{[4]u64, packQ{ty_s{a},ty_s{b}}, 4b3120}}
-def topAny{x:T & w256i{T,32}} = ~emit{u1, '_mm256_testz_ps', v2f{x}, v2f{x}}
-def topAny{x:T & w256i{T,64}} = ~emit{u1, '_mm256_testz_pd', v2d{x}, v2d{x}}
-def homAny{x:T & w256i{T} & elwidth{T}>=32} = topAny{x}
+def topAny{x:T if w256i{T,32}} = ~emit{u1, '_mm256_testz_ps', v2f{x}, v2f{x}}
+def topAny{x:T if w256i{T,64}} = ~emit{u1, '_mm256_testz_pd', v2d{x}, v2d{x}}
+def homAny{x:T if w256i{T} and elwidth{T}>=32} = topAny{x}
-def topAny{x:T & w256i{T,16}} = homAny{[16]i16~~x < [16]i16**0}
-def topAll{x:T & w256i{T,16}} = homAll{[16]i16~~x < [16]i16**0}
+def topAny{x:T if w256i{T,16}} = homAny{[16]i16~~x < [16]i16**0}
+def topAll{x:T if w256i{T,16}} = homAll{[16]i16~~x < [16]i16**0}
# conversion
-def widen{T==[16]u16, x:X & X==[16]u8} = emit{T, '_mm256_cvtepu8_epi16', x}; def widen{T==[16]i16, x:X & X==[16]i8} = emit{T, '_mm256_cvtepi8_epi16', x}
-def widen{T==[ 8]u32, x:X & X==[16]u8} = emit{T, '_mm256_cvtepu8_epi32', x}; def widen{T==[ 8]i32, x:X & X==[16]i8} = emit{T, '_mm256_cvtepi8_epi32', x}
-def widen{T==[ 8]u32, x:X & X==[8]u16} = emit{T, '_mm256_cvtepu16_epi32', x}; def widen{T==[ 8]i32, x:X & X==[8]i16} = emit{T, '_mm256_cvtepi16_epi32', x}
-def widen{T==[ 4]u64, x:X & X==[16]u8} = emit{T, '_mm256_cvtepu8_epi64', x}; def widen{T==[ 4]i64, x:X & X==[16]i8} = emit{T, '_mm256_cvtepi8_epi64', x}
-def widen{T==[ 4]u64, x:X & X==[8]u16} = emit{T, '_mm256_cvtepu16_epi64', x}; def widen{T==[ 4]i64, x:X & X==[8]i16} = emit{T, '_mm256_cvtepi16_epi64', x}
-def widen{T==[ 4]u64, x:X & X==[4]u32} = emit{T, '_mm256_cvtepu32_epi64', x}; def widen{T==[ 4]i64, x:X & X==[4]i32} = emit{T, '_mm256_cvtepi32_epi64', x}
-
-def narrow{T, x:X & w256i{X,32} & width{T}==8} = {
+def widen{T==[16]u16, x:X if X==[16]u8} = emit{T, '_mm256_cvtepu8_epi16', x}; def widen{T==[16]i16, x:X if X==[16]i8} = emit{T, '_mm256_cvtepi8_epi16', x}
+def widen{T==[ 8]u32, x:X if X==[16]u8} = emit{T, '_mm256_cvtepu8_epi32', x}; def widen{T==[ 8]i32, x:X if X==[16]i8} = emit{T, '_mm256_cvtepi8_epi32', x}
+def widen{T==[ 8]u32, x:X if X==[8]u16} = emit{T, '_mm256_cvtepu16_epi32', x}; def widen{T==[ 8]i32, x:X if X==[8]i16} = emit{T, '_mm256_cvtepi16_epi32', x}
+def widen{T==[ 4]u64, x:X if X==[16]u8} = emit{T, '_mm256_cvtepu8_epi64', x}; def widen{T==[ 4]i64, x:X if X==[16]i8} = emit{T, '_mm256_cvtepi8_epi64', x}
+def widen{T==[ 4]u64, x:X if X==[8]u16} = emit{T, '_mm256_cvtepu16_epi64', x}; def widen{T==[ 4]i64, x:X if X==[8]i16} = emit{T, '_mm256_cvtepi16_epi64', x}
+def widen{T==[ 4]u64, x:X if X==[4]u32} = emit{T, '_mm256_cvtepu32_epi64', x}; def widen{T==[ 4]i64, x:X if X==[4]i32} = emit{T, '_mm256_cvtepi32_epi64', x}
+
+def narrow{T, x:X if w256i{X,32} and width{T}==8} = {
a:= packQ{x, x}
b:= packQ{a, a}
re_el{T, sel{[8]u32, b, make{[8]i32, 0,4,0,4,0,4,0,4}}}
}
-def narrow{T, x:X & w256i{X,32} & width{T}==16} = re_el{T, shuf{[4]u64, packQ{x, x}, 4b3120}}
-def narrow{T, x:X & w256i{X,16} & width{T}== 8} = re_el{T, shuf{[4]u64, packQ{x, x}, 4b3120}}
+def narrow{T, x:X if w256i{X,32} and width{T}==16} = re_el{T, shuf{[4]u64, packQ{x, x}, 4b3120}}
+def narrow{T, x:X if w256i{X,16} and width{T}== 8} = re_el{T, shuf{[4]u64, packQ{x, x}, 4b3120}}
-def narrow{T, x:X & w256f{X,64} & T<i32} = narrow{T, narrow{i32, x}}
-def narrow{T, x:X & w256f{X,64} & T==i32} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
+def narrow{T, x:X if w256f{X,64} and T<i32} = narrow{T, narrow{i32, x}}
+def narrow{T, x:X if w256f{X,64} and T==i32} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
-def narrow{T, x:X & w256u{X,64} & T==u32} = re_el{T, sel{[8]i32, x, make{[8]i32, 2*iota{8}}}}
-def narrow{T, x:X & w256u{X,64} & T==u16} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, (iota{32}>>1<<2) | (iota{32}&1)}}}
-def narrow{T, x:X & w256u{X,64} & T== u8} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, 4*iota{32}}}}
+def narrow{T, x:X if w256u{X,64} and T==u32} = re_el{T, sel{[8]i32, x, make{[8]i32, 2*iota{8}}}}
+def narrow{T, x:X if w256u{X,64} and T==u16} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, (iota{32}>>1<<2) | (iota{32}&1)}}}
+def narrow{T, x:X if w256u{X,64} and T== u8} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, 4*iota{32}}}}
-def cvt2{T, x:X & T==i32 & X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
-def cvt2{T, x:X & T==f64 & X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x}
+def cvt2{T, x:X if T==i32 and X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
+def cvt2{T, x:X if T==f64 and X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x}
diff --git a/src/singeli/src/avx512.singeli b/src/singeli/src/avx512.singeli
index 9f7936fb..737ff41c 100644
--- a/src/singeli/src/avx512.singeli
+++ b/src/singeli/src/avx512.singeli
@@ -4,30 +4,30 @@ local {
if (isfloat{T}) (if (width{T}==32) 'ps' else 'pd')
else merge{'epi', fmtnat{width{T}}}
}
- def suf{V & isvec{V}} = suf{eltype{V}}
+ def suf{V if isvec{V}} = suf{eltype{V}}
def pref{w} = merge{'_mm', if (w==128) '' else fmtnat{w}, '_'}
- def pref{V & isvec{V}} = pref{width{V}}
+ def pref{V if isvec{V}} = pref{width{V}}
}
local def re_mask{M, sub} = {
def l = vcount{M}; def w = max{32,l}
sub{fmtnat{l}, fmtnat{w}, ty_u{w}}
}
-def reinterpret{M, a:T & ismask{M} & width{T}==width{M}} = {
+def reinterpret{M, a:T if ismask{M} and width{T}==width{M}} = {
re_mask{M, {l,w,W} => emit{M, merge{'_cvtu',w,'_mask',l}, promote{W, a}}}
}
-def reinterpret{T, a:M & ismask{M} & width{T}==width{M}} = {
+def reinterpret{T, a:M if ismask{M} and width{T}==width{M}} = {
re_mask{M, {l,w,W} => cast_i{T, emit{W, merge{'_cvtmask',l,'_u',w}, a}}}
}
-def maskStore{p:*V, m:M, v:V & ismask{M} & isvec{V} & vcount{M}==vcount{V}} = {
+def maskStore{p:*V, m:M, v:V if ismask{M} and isvec{V} and vcount{M}==vcount{V}} = {
emit{void, merge{pref{V}, 'mask_storeu_', suf{V}}, p, m, v}
}
def topMaskReg{x:V} = emit{[vcount{V}]u1, merge{pref{V},'mov',suf{V},'_mask'}, x}
-def topMask{x:T & isvec{T} & 512==width{T}} = ty_u{vcount{T}}~~topMaskReg{x}
-def homMask{x:T & isvec{T} & 512==width{T}} = topMask{x}
+def topMask{x:T if isvec{T} and 512==width{T}} = ty_u{vcount{T}}~~topMaskReg{x}
+def homMask{x:T if isvec{T} and 512==width{T}} = topMask{x}
-def maskToHom{T, x:M & ismask{M} & isvec{T} & vcount{M}==vcount{T}} = {
+def maskToHom{T, x:M if ismask{M} and isvec{T} and vcount{M}==vcount{T}} = {
emit{T, merge{pref{T},'movm_',suf{T}}, x}
}
diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli
index af151f78..b8fe0710 100644
--- a/src/singeli/src/base.singeli
+++ b/src/singeli/src/base.singeli
@@ -18,38 +18,38 @@ def istup = ktup
def isunsigned{T} = isint{T} & ~issigned{T}
-def isvec {T} = 0; def isvec {T & istype{T}} = same{typekind{T},'vector'}
-def isprim{T} = 0; def isprim{T & istype{T}} = same{typekind{T},'primitive'}
-def isptr {T} = 0; def isptr {T & istype{T}} = same{typekind{T},'pointer'}
+def isvec {T} = 0; def isvec {T if istype{T}} = same{typekind{T},'vector'}
+def isprim{T} = 0; def isprim{T if istype{T}} = same{typekind{T},'primitive'}
+def isptr {T} = 0; def isptr {T if istype{T}} = same{typekind{T},'pointer'}
def elwidth{T} = width{eltype{T}}
oper &~ andnot infix none 35
-def andnot{a, b & anyNum{a} & anyNum{b}} = a & ~b
+def andnot{a, b if anyNum{a} and anyNum{b}} = a & ~b
-def load{p:P, n & isvec{eltype{P}}} = assert{0}
-def store{p:P, n, v & isvec{eltype{P}}} = assert{0}
-def load{p:P & isptr{P}} = load{p, 0}
-# def store{p:P, v & isptr{P}} = store{p, 0, v}
-def loadu{p:T & isunsigned{eltype{T}}} = emit{eltype{T}, merge{'loadu_u',fmtnat{elwidth{T}}}, p}
-def storeu{p:T, v:(eltype{T}) & isunsigned{eltype{T}}} = emit{void, merge{'storeu_u',fmtnat{elwidth{T}}}, p, v}
-def loadu{p:T & issigned{eltype{T}}} = loadu {*ty_u{eltype{T}} ~~ p}
-def storeu{p:T, v:(eltype{T}) & issigned{eltype{T}}} = storeu{*ty_u{eltype{T}} ~~ p, ty_u{v}}
-def loadu{p:T & elwidth{T}==8} = load{p}
-def storeu{p:T, v:(eltype{T}) & elwidth{T}==8} = store{p, v}
+def load{p:P, n if isvec{eltype{P}}} = assert{0}
+def store{p:P, n, v if isvec{eltype{P}}} = assert{0}
+def load{p:P if isptr{P}} = load{p, 0}
+# def store{p:P, v if isptr{P}} = store{p, 0, v}
+def loadu{p:T if isunsigned{eltype{T}}} = emit{eltype{T}, merge{'loadu_u',fmtnat{elwidth{T}}}, p}
+def storeu{p:T, v:(eltype{T}) if isunsigned{eltype{T}}} = emit{void, merge{'storeu_u',fmtnat{elwidth{T}}}, p, v}
+def loadu{p:T if issigned{eltype{T}}} = loadu {*ty_u{eltype{T}} ~~ p}
+def storeu{p:T, v:(eltype{T}) if issigned{eltype{T}}} = storeu{*ty_u{eltype{T}} ~~ p, ty_u{v}}
+def loadu{p:T if elwidth{T}==8} = load{p}
+def storeu{p:T, v:(eltype{T}) if elwidth{T}==8} = store{p, v}
-def reinterpret{T, x:X & T==X} = x
+def reinterpret{T, x:X if T==X} = x
def exportN{f, ...ns} = each{export{.,f}, ns}
def exportT{name, fs} = { v:*type{select{fs,0}} = fs; export{name, v} }
# hints
-def rare{x & knum{x}} = x
+def rare{x if knum{x}} = x
def rare{x:(u1)} = emit{u1, '__builtin_expect', x, 0}
-def assert{x & x==0} = assert{'failed assertion'}
-def assert{x & x==1} = 1
+def assert{x if x==0} = assert{'failed assertion'}
+def assert{x if x==1} = 1
def unreachable{} = emit{void, 'si_unreachable'}
def assert{x:(u1)} = { if (not x) emit{void, 'si_unreachable'} }
@@ -65,24 +65,24 @@ def anyNum{x} = isconst{x} | knum{x}
def anyNum{x:T} = isprim{T}
def anyInt{x} = 0
-def anyInt{x & knum{x}} = (x>>0) == x
-def anyInt{x & isreg{x}|isconst{x}} = isint{x}
+def anyInt{x if knum{x}} = (x>>0) == x
+def anyInt{x if isreg{x}|isconst{x}} = isint{x}
# vector width/type checks
-def w64 {T} = 0; def w64 {T & isvec{T}} = width{T}==64
-def w128{T} = 0; def w128{T & isvec{T}} = width{T}==128
-def w256{T} = 0; def w256{T & isvec{T}} = width{T}==256
-def w64 {T,w} = 0; def w64 {T,w & w64{T}} = elwidth{T}==w
-def w128{T,w} = 0; def w128{T,w & w128{T}} = elwidth{T}==w
-def w256{T,w} = 0; def w256{T,w & w256{T}} = elwidth{T}==w
+def w64 {T} = 0; def w64 {T if isvec{T}} = width{T}==64
+def w128{T} = 0; def w128{T if isvec{T}} = width{T}==128
+def w256{T} = 0; def w256{T if isvec{T}} = width{T}==256
+def w64 {T,w} = 0; def w64 {T,w if w64{T}} = elwidth{T}==w
+def w128{T,w} = 0; def w128{T,w if w128{T}} = elwidth{T}==w
+def w256{T,w} = 0; def w256{T,w if w256{T}} = elwidth{T}==w
# width+type checks
def genchk{B, F} = {
def r{T} = 0
- def r{T & B{T}} = F{eltype{T}}
+ def r{T if B{T}} = F{eltype{T}}
def r{T,w} = 0
- def r{T,w & B{T}} = F{eltype{T}} & (elwidth{T}==w)
- def r{T & ~isvec{T}} = 0
+ def r{T,w if B{T}} = F{eltype{T}} & (elwidth{T}==w)
+ def r{T if ~isvec{T}} = 0
r
}
def w256i = genchk{w256, isint}; def w128i = genchk{w128, isint}; def w64i = genchk{w64, isint}
@@ -92,14 +92,14 @@ def w256f = genchk{w256, isfloat}; def w128f = genchk{w128, isfloat}; de
-def trunc{T, x:U & isint{T} & isint{U} & T<=U} = emit{T, '', x}
-def trunc{T, x & knum{x}} = cast{T, x}
+def trunc{T, x:U if isint{T} and isint{U} and T<=U} = emit{T, '', x}
+def trunc{T, x if knum{x}} = cast{T, x}
-def tern{c, T, F & anyInt{c}} = {
+def tern{c, T, F if anyInt{c}} = {
if(c) T
else F
}
-def tern{c, t:T, f:T & anyInt{c}} = {
+def tern{c, t:T, f:T if anyInt{c}} = {
res:T = f
if (c) res = t
res
@@ -111,42 +111,42 @@ def re_el{E, V} = [width{V}/width{E}]E
def re_el{E, x:V} = re_el{E,V} ~~ x
local def qualChange{q} = {
- def f{w & knum{w}} = primtype{q, w}
- def f{T & isprim{T}} = primtype{q, width{T}}
- def f{T & isvec{T}} = re_el{f{eltype{T}}, T}
+ def f{w if knum{w}} = primtype{q, w}
+ def f{T if isprim{T}} = primtype{q, width{T}}
+ def f{T if isvec{T}} = re_el{f{eltype{T}}, T}
def f{x:T} = f{T}~~x
}
def ty_u = qualChange{'u'}
def ty_s = qualChange{'i'}
def ty_f = qualChange{'f'}
-def w_n{T, w & isprim{T}} = primtype{quality{T}, w}
-def w_d{T & isprim{T}} = to_w{T, width{T}*2} # double/halve primitive type width
-def w_h{T & isprim{T}} = to_w{T, width{T}/2}
+def w_n{T, w if isprim{T}} = primtype{quality{T}, w}
+def w_d{T if isprim{T}} = to_w{T, width{T}*2} # double/halve primitive type width
+def w_h{T if isprim{T}} = to_w{T, width{T}/2}
-def n_d{T & isvec{T}} = [vcount{T}*2](eltype{T}) # double/halve vector count
-def n_h{T & isvec{T}} = [vcount{T}/2](eltype{T})
+def n_d{T if isvec{T}} = [vcount{T}*2](eltype{T}) # double/halve vector count
+def n_h{T if isvec{T}} = [vcount{T}/2](eltype{T})
-def el_d{T & isvec{T}} = [vcount{T}](w_d{eltype{T}}) # double/halve element width, preserving count
-def el_h{T & isvec{T}} = [vcount{T}](w_h{eltype{T}})
+def el_d{T if isvec{T}} = [vcount{T}](w_d{eltype{T}}) # double/halve element width, preserving count
+def el_h{T if isvec{T}} = [vcount{T}](w_h{eltype{T}})
-def el_m{T & isvec{T}} = re_el{w_d{eltype{T}}, T}; def el_m{x:T} = re_el{T}~~x # double/halve element width, preserving width
-def el_s{T & isvec{T}} = re_el{w_h{eltype{T}}, T}; def el_s{x:T} = re_el{T}~~x
+def el_m{T if isvec{T}} = re_el{w_d{eltype{T}}, T}; def el_m{x:T} = re_el{T}~~x # double/halve element width, preserving width
+def el_s{T if isvec{T}} = re_el{w_h{eltype{T}}, T}; def el_s{x:T} = re_el{T}~~x
# type stats
-def minvalue{T & isunsigned{T}} = 0
-def maxvalue{T & isunsigned{T}} = (1<<width{T})-1
-def minvalue{T & issigned{T}} = - (1<<(width{T}-1))
-def maxvalue{T & issigned{T}} = (1<<(width{T}-1))-1
+def minvalue{T if isunsigned{T}} = 0
+def maxvalue{T if isunsigned{T}} = (1<<width{T})-1
+def minvalue{T if issigned{T}} = - (1<<(width{T}-1))
+def maxvalue{T if issigned{T}} = (1<<(width{T}-1))-1
# tuple operations
-def iota{n & knum{n}} = range{n}
-def broadcast{T, v & isprim{T}} = v
-def broadcast{n, v & knum{n}} = each{{_}=>v, range{n}}
-def collect{vars,begin,end,iter & knum{begin} & knum{end}} = {
+def iota{n if knum{n}} = range{n}
+def broadcast{T, v if isprim{T}} = v
+def broadcast{n, v if knum{n}} = each{{_}=>v, range{n}}
+def collect{vars,begin,end,iter if knum{begin} and knum{end}} = {
each{iter{., vars}, range{end-begin}+begin}
}
@@ -162,7 +162,7 @@ def fast_BMI2{} = if (SLOW_PDEP) 0 else hasarch{'BMI2'}
# test if vector has a specific width & element type
def lvec{T, n, w} = 0
-def lvec{T, n, w & isvec{T} & vcount{T}==n & elwidth{T}==w} = 1
+def lvec{T, n, w if isvec{T} and vcount{T}==n and elwidth{T}==w} = 1
# base cases
def {
@@ -176,7 +176,7 @@ def {
def homMaskX{a:T} = tup{1, homMask{a}} # tup{n,mask}; mask with each bit repeated n times
def ctzX{{n,v}} = ctz{v}/n # ctz for a result of homMaskX
-def homMask{...vs & length{vs}>1} = {
+def homMask{...vs if length{vs}>1} = {
def n = length{vs}
def T = oneType{vs}
def RT = ty_u{max{8,vcount{T}*n}}
@@ -204,15 +204,15 @@ def mzipHi{a:T, b:T} = el_m{T} ~~ zipHi{a, b}
def packQ{{a, b}} = packQ{a, b}
def pair{{a, b}} = pair{a, b}
-def widen{T, x:X & T==X} = x
-def narrow{T, x:X & T==eltype{X}} = x
-def undefPromote{T, x:X & T==X} = x
-def cvt{T, x:X & T==eltype{X}} = x
+def widen{T, x:X if T==X} = x
+def narrow{T, x:X if T==eltype{X}} = x
+def undefPromote{T, x:X if T==X} = x
+def cvt{T, x:X if T==eltype{X}} = x
-def broadcast{T, v & isvec{T}} = vec_broadcast{T, promote{eltype{T},v}}
-def make{T, ...xs & isvec{T}} = vec_make{T, ...xs}
-def iota{T & isvec{T}} = make{T, ...iota{vcount{T}}}
-def absu{a:T & isvec{T}} = ty_u{abs{a}}
+def broadcast{T, v if isvec{T}} = vec_broadcast{T, promote{eltype{T},v}}
+def make{T, ...xs if isvec{T}} = vec_make{T, ...xs}
+def iota{T if isvec{T}} = make{T, ...iota{vcount{T}}}
+def absu{a:T if isvec{T}} = ty_u{abs{a}}
def floor = __floor
def ceil = __ceil
@@ -226,29 +226,29 @@ def sqrt = __sqrt
# more arith
-def min{a, b & anyNum{a} & anyNum{b}} = tern{a<b, a, b}
-def max{a, b & anyNum{a} & anyNum{b}} = tern{a>b, a, b}
+def min{a, b if anyNum{a} and anyNum{b}} = tern{a<b, a, b}
+def max{a, b if anyNum{a} and anyNum{b}} = tern{a>b, a, b}
def cdiv{a,b} = (a+b-1)/b # ceiling divide
-def cdiv{a,b & knum{a} & knum{b}} = ceil{a/b}
-def popc{x:T & isint{T} & width{T}==64} = emit{ux, '__builtin_popcountll', x}
-def popc{x:T & isint{T} & width{T}<=32} = emit{ux, '__builtin_popcount', x}
-def ctz{x:T & isint{T} & width{T}==64} = emit{ux, '__builtin_ctzll', x}
-def ctz{x:T & isint{T} & width{T}<=32} = emit{ux, '__builtin_ctz', x}
-def clz{x:T & isint{T} & width{T}==64} = emit{ux, '__builtin_clzll', x}
-def clz{x:T & isint{T} & width{T}<=32} = emit{ux, '__builtin_clz', x}
+def cdiv{a,b if knum{a} and knum{b}} = ceil{a/b}
+def popc{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_popcountll', x}
+def popc{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_popcount', x}
+def ctz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_ctzll', x}
+def ctz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_ctz', x}
+def clz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_clzll', x}
+def clz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_clz', x}
# count-leading-zeros complement, less type-dependent
-def clzc{x:T & isint{T} & width{T}==64} = 64-clz{x}
-def clzc{x:T & isint{T} & width{T}<=32} = 32-clz{x}
+def clzc{x:T if isint{T} and width{T}==64} = 64-clz{x}
+def clzc{x:T if isint{T} and width{T}<=32} = 32-clz{x}
def ceil_log2{n} = clzc{n-1}
-def truncBits{n, v & n<=8} = cast_i{u8, v}
-def truncBits{n, v & n==16} = cast_i{u16, v}
-def truncBits{n, v & n==32} = cast_i{u32, v}
-def truncBits{n, v & n==64} = cast_i{u64, v}
+def truncBits{n, v if n<=8} = cast_i{u8, v}
+def truncBits{n, v if n==16} = cast_i{u16, v}
+def truncBits{n, v if n==32} = cast_i{u32, v}
+def truncBits{n, v if n==64} = cast_i{u64, v}
# base-2 log of a constant power of two
-def lb{n & knum{n} & (n>>1<<1) == n & n>0} = lb{n>>1}+1
+def lb{n if knum{n} and (n>>1<<1) == n and n>0} = lb{n>>1}+1
def lb{n==1} = 0
def zlow{n,x} = (x >> n) << n # zero out n least significant bits
@@ -256,11 +256,11 @@ def tail{n,x} = x & ((1<<n) - 1) # get the n least significant bits
def bit {k,x} = x & (1<<k) # get the k-th bit
# range logic
-def inRangeLen{x:TS, start, count & issigned{eltype{TS}}} = { # ∊ [start;start+count)
+def inRangeLen{x:TS, start, count if issigned{eltype{TS}}} = { # ∊ [start;start+count)
def TU = ty_u{TS}
(TU~~(x-TS**start)) < TU**count
}
-def inRangeLen{x:TU, start, count & isunsigned{eltype{TU}}} = { # ∊ [start;start+count)
+def inRangeLen{x:TU, start, count if isunsigned{eltype{TU}}} = { # ∊ [start;start+count)
def TS = ty_s{TU}
def h = 1 << (elwidth{TU}-1)
(TS~~(x-TU**(start-h))) < TS**(count-h)
@@ -270,8 +270,8 @@ def inRangeExcl{x:T, start, end} = inRangeLen{x, start, end-start} # ∊ [start;
-def load{p,i & kgen{p}} = p{i}
-def store{p,i,x & kgen{p}} = p{i,x}
+def load{p,i if kgen{p}} = p{i}
+def store{p,i,x if kgen{p}} = p{i,x}
def tptr{l,s} = { # create "pointer" generator with given load & store definitions
def r{i} = l{i}
def r{i,x} = s{i,x}
@@ -329,7 +329,7 @@ def eachx{F, ...args} = {
each{F, ...each{{x} => if (istup{x}) x else l**x, args}}
}
-def undef{T, n & istype{T}} = @collect(n) undef{T}
-def undef{Ts & istup{Ts}} = each{undef, Ts}
+def undef{T, n if istype{T}} = @collect(n) undef{T}
+def undef{Ts if istup{Ts}} = each{undef, Ts}
def undef{x:T} = undef{T}
-def undef{T & istype{T}} = { reg:=undefined{T} }
+def undef{T if istype{T}} = { reg:=undefined{T} }
diff --git a/src/singeli/src/bins.singeli b/src/singeli/src/bins.singeli
index 08459bce..128ab8bb 100644
--- a/src/singeli/src/bins.singeli
+++ b/src/singeli/src/bins.singeli
@@ -41,20 +41,20 @@ fn max_scan{T, up}(x:*T, len:u64) : void = {
def getsel{...x} = assert{'shuffling not supported', show{...x}}
if_inline (hasarch{'AVX2'}) {
- def getsel{h:H & lvec{H, 16, 8}} = {
+ def getsel{h:H if lvec{H, 16, 8}} = {
sel{H, pair{h,h}, .}
}
- def getsel{v:V & lvec{V, 32, 8}} = {
+ def getsel{v:V if lvec{V, 32, 8}} = {
def H = n_h{V}
vtop := V**(vcount{V}/2)
hs := each{shuf{[4]u64, v, .}, tup{4b3232, 4b1010}}
{i} => homBlend{...each{sel{H,.,i}, hs}, V~~i<vtop}
}
- def getsel{v:V & lvec{V, 8, 32}} = sel{V, v, .}
+ def getsel{v:V if lvec{V, 8, 32}} = sel{V, v, .}
}
# Move evens to half 0 and odds to half 1
-def uninterleave{x:V & hasarch{'AVX2'}} = {
+def uninterleave{x:V if hasarch{'AVX2'}} = {
def vl = vcount{V}; def bytes = width{eltype{V}}/8
def i = 2*iota{vl/4}
def i2= join{table{+, bytes*merge{i,i+1}, iota{bytes}}}
@@ -108,7 +108,7 @@ fn write_indices{I,T}(t:*I, w:*T, n:u64) : void = {
}
setlabel{break}
}
-fn write_indices{I,T & width{I}==8}(t:*I, w:*T, n:u64) : void = {
+fn write_indices{I,T if width{I}==8}(t:*I, w:*T, n:u64) : void = {
@for (w over j to n) store{t, w, cast_i{I, j+1}}
}
def bins_lookup{I, T, up, w:*T, wn:(u64), x:*T, xn:(u64), rp:(*void)} = {
@@ -129,7 +129,7 @@ def bins_lookup{I, T, up, w:*T, wn:(u64), x:*T, xn:(u64), rp:(*void)} = {
tfree{t0}
}
-def bins_vectab_i8{up, w, wn, x, xn, rp, t0, t, done & hasarch{'AVX2'}} = {
+def bins_vectab_i8{up, w, wn, x, xn, rp, t0, t, done if hasarch{'AVX2'}} = {
assert{wn < 128} # Total must fit in i8
def vl = 32
def T = i8
@@ -200,7 +200,7 @@ def bins_vectab_i8{up, w, wn, x, xn, rp, t0, t, done & hasarch{'AVX2'}} = {
}
# Binary search within vector registers
-def bin_search_vec{prim, T, w:*T, wn, x:*T, xn, rp, maxwn & hasarch{'AVX2'}} = {
+def bin_search_vec{prim, T, w:*T, wn, x:*T, xn, rp, maxwn if hasarch{'AVX2'}} = {
def up = prim != '⍒'
def search = (prim=='∊') | (prim=='⊐')
assert{wn > 1}; assert{wn < maxwn}
diff --git a/src/singeli/src/bitops.singeli b/src/singeli/src/bitops.singeli
index 5dcf9c62..6d56a11f 100644
--- a/src/singeli/src/bitops.singeli
+++ b/src/singeli/src/bitops.singeli
@@ -4,13 +4,13 @@ def b_get{x:(*u64), n:(ux)} = {
((load{x,n>>6}>>(n&63)) & 1) != 0
}
-def b_getBatchLo{sz, x:(*u64), n:(ux) & sz==2} = (load{*u8~~x, n>>2} >> cast_i{u8, (n&3)*2})
-def b_getBatchLo{sz, x:(*u64), n:(ux) & sz==4} = (load{*u8~~x, n>>1} >> cast_i{u8, (n&1)*4})
-def b_getBatchLo{sz, x:(*u64), n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
+def b_getBatchLo{sz, x:(*u64), n:(ux) if sz==2} = (load{*u8~~x, n>>2} >> cast_i{u8, (n&3)*2})
+def b_getBatchLo{sz, x:(*u64), n:(ux) if sz==4} = (load{*u8~~x, n>>1} >> cast_i{u8, (n&1)*4})
+def b_getBatchLo{sz, x:(*u64), n:(ux) if sz>=8} = load{*ty_u{sz}~~x, n}
-def b_getBatch{sz, x:(*u64), n:(ux) & sz==2} = b_getBatchLo{sz, x, n} & 3
-def b_getBatch{sz, x:(*u64), n:(ux) & sz==4} = b_getBatchLo{sz, x, n} & 15
-def b_getBatch{sz, x:(*u64), n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n}
+def b_getBatch{sz, x:(*u64), n:(ux) if sz==2} = b_getBatchLo{sz, x, n} & 3
+def b_getBatch{sz, x:(*u64), n:(ux) if sz==4} = b_getBatchLo{sz, x, n} & 15
+def b_getBatch{sz, x:(*u64), n:(ux) if sz>=8} = load{*ty_u{sz}~~x, n}
def b_set{x:(*u64), n:(ux), v:(u1)} = {
@@ -30,7 +30,7 @@ def b_setBatch{sz, x:(*u64), n:(ux), v} = {
store{x, n/am, w}
}
-def b_setBatch{sz, x:(*u64), n:(ux), v & sz==4} = {
+def b_setBatch{sz, x:(*u64), n:(ux), v if sz==4} = {
x8:= *u8 ~~ x
#w:u64 = cast_i{u64, load{x8,n/2}}
@@ -49,10 +49,10 @@ def b_setBatch{sz, x:(*u64), n:(ux), v & sz==4} = {
store{x8, n/2, cast_i{u8,w}}
}
-def b_setBatch{sz, x:(*u64), n:(ux), v & sz== 8} = store{*u8 ~~ x, n, cast_i{u8, v}}
-def b_setBatch{sz, x:(*u64), n:(ux), v & sz==16} = store{*u16 ~~ x, n, cast_i{u16,v}}
-def b_setBatch{sz, x:(*u64), n:(ux), v & sz==32} = store{*u32 ~~ x, n, cast_i{u32,v}}
-def b_setBatch{sz, x:(*u64), n:(ux), v & sz==64} = store{ x, n, cast_i{u64,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v if sz== 8} = store{*u8 ~~ x, n, cast_i{u8, v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v if sz==16} = store{*u16 ~~ x, n, cast_i{u16,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v if sz==32} = store{*u32 ~~ x, n, cast_i{u32,v}}
+def b_setBatch{sz, x:(*u64), n:(ux), v if sz==64} = store{ x, n, cast_i{u64,v}}
def spreadBits{T==[32]u8, a:(u32)} = {
def idxs = iota{32}
@@ -63,17 +63,17 @@ def spreadBits{T==[32]u8, a:(u32)} = {
e == (d&e)
}
-def spreadBits{T==[16]u8, a:(u16) & hasarch{'AARCH64'}} = {
+def spreadBits{T==[16]u8, a:(u16) if hasarch{'AARCH64'}} = {
b:= sel{[16]u8, [16]u8~~[8]u16**a, make{[16]i8, iota{16}>=8}}
andnz{b, make{[16]u8, 1<<(iota{16}&7)}}
}
-def spreadBits{T==[16]u8, a:(u16) & hasarch{'X86_64'}} = {
+def spreadBits{T==[16]u8, a:(u16) if hasarch{'X86_64'}} = {
b:= [16]u8~~[8]u16**a
exp:= [16]u8~~shuf{[4]i32, shuf16Lo{mzipLo{b, b}, 4b1100}, 4b1100}
(exp & make{[16]u8, 1<<(iota{16}&7)}) != [16]u8**0
}
-def spreadBits{T, a & vcount{T} <= elwidth{T} & quality{eltype{T}}=='u'} = {
+def spreadBits{T, a if vcount{T} <= elwidth{T} and quality{eltype{T}}=='u'} = {
b:= make{T, 1<<iota{vcount{T}}}
b == (b & T ~~ re_el{type{a}, T}**a) # not just T**a so that if a is read from RAM, it can use the single instruction for broadcasting from RAM; the extra bits don't matter
}
@@ -90,10 +90,10 @@ def loaduBit{x:(*u64), i, n} = {
assert{(n<58) | (((n==58) | (n==60)) & (i%n == 0))}
loaduBitRaw{x, i}
}
-def loaduBitTrunc{x:(*u64), i, n & knum{n}} = truncBits{n, loaduBit{x, i, n}}
+def loaduBitTrunc{x:(*u64), i, n if knum{n}} = truncBits{n, loaduBit{x, i, n}}
-def loadBatchBit{T, x:(*u64), is & ktup{is}} = {
+def loadBatchBit{T, x:(*u64), is if ktup{is}} = {
# def len = length{is}
# def count = vcount{T}
# assert{count*len <= 64}
diff --git a/src/singeli/src/cbqnDefs.singeli b/src/singeli/src/cbqnDefs.singeli
index 8aaf8026..cac96e91 100644
--- a/src/singeli/src/cbqnDefs.singeli
+++ b/src/singeli/src/cbqnDefs.singeli
@@ -1,13 +1,13 @@
def bcall{T, f, x} = emit{T, 'BCALL', f, x}
-def from_B{T, x & T==f64} = bcall{T, 'o2fG', x}
-def from_B{T, x & T<=i32 & issigned{T}} = bcall{T, 'o2iG', x}
-def from_B{T, x & T<=u32 & isunsigned{T}} = bcall{T, 'o2cG', x}
+def from_B{T, x if T==f64} = bcall{T, 'o2fG', x}
+def from_B{T, x if T<=i32 and issigned{T}} = bcall{T, 'o2iG', x}
+def from_B{T, x if T<=u32 and isunsigned{T}} = bcall{T, 'o2cG', x}
def q_f64{x} = bcall{u1, 'q_f64', x}
def q_chr{x} = bcall{u1, 'q_c32', x}
-def q_chr{T,x & T==u8 } = bcall{u1, 'q_c8', x}
-def q_chr{T,x & T==u16} = bcall{u1, 'q_c16', x}
-def q_chr{T,x & T==u32} = bcall{u1, 'q_c32', x}
+def q_chr{T,x if T==u8 } = bcall{u1, 'q_c8', x}
+def q_chr{T,x if T==u16} = bcall{u1, 'q_c16', x}
+def q_chr{T,x if T==u32} = bcall{u1, 'q_c32', x}
def cbqn_c32Tag{} = emit{u64, '', 'C32_TAG'}
def cbqn_tagTag{} = emit{u64, '', 'TAG_TAG'}
@@ -21,18 +21,18 @@ def cbqn_nspTag{} = emit{u64, '', 'NSP_TAG'}
def cbqn_objTag{} = emit{u64, '', 'OBJ_TAG'}
def cbqn_arrTag{} = emit{u64, '', 'ARR_TAG'}
-def cbqn_elType{T & T==u1 } = 0
-def cbqn_elType{T & T==i8 } = 1
-def cbqn_elType{T & T==i16} = 2
-def cbqn_elType{T & T==i32} = 3
-def cbqn_elType{T & T==f64} = 4
-def cbqn_elType{T & T==u8 } = 5
-def cbqn_elType{T & T==u16} = 6
-def cbqn_elType{T & T==u32} = 7
+def cbqn_elType{T if T==u1 } = 0
+def cbqn_elType{T if T==i8 } = 1
+def cbqn_elType{T if T==i16} = 2
+def cbqn_elType{T if T==i32} = 3
+def cbqn_elType{T if T==f64} = 4
+def cbqn_elType{T if T==u8 } = 5
+def cbqn_elType{T if T==u16} = 6
+def cbqn_elType{T if T==u32} = 7
def cbqn_tyArrOffset{} = emit{u64, 'offsetof', 'TyArr', 'a'}
def talloc{T, len} = emit{*T, 'TALLOCP', fmt_type{T}, len}
def tfree{ptr} = emit{void, 'TFREE', ptr}
def fmt_type{T} = merge{quality{T}, fmtnat{width{T}}}
-def fmt_type{T & isptr{T}} = merge{'*',fmt_type{eltype{T}}}
+def fmt_type{T if isptr{T}} = merge{'*',fmt_type{eltype{T}}}
diff --git a/src/singeli/src/clmul.singeli b/src/singeli/src/clmul.singeli
index ea5f9123..c87bf1db 100644
--- a/src/singeli/src/clmul.singeli
+++ b/src/singeli/src/clmul.singeli
@@ -1,2 +1,2 @@
-def clmul{a:T, b:T, imm & w128i{T}} = emit{T, '_mm_clmulepi64_si128', a, b, imm}
+def clmul{a:T, b:T, imm if w128i{T}} = emit{T, '_mm_clmulepi64_si128', a, b, imm}
def clmul{a, b} = clmul{a, b, 0}
diff --git a/src/singeli/src/cmp.singeli b/src/singeli/src/cmp.singeli
index e47db6ca..04d673d6 100644
--- a/src/singeli/src/cmp.singeli
+++ b/src/singeli/src/cmp.singeli
@@ -16,13 +16,13 @@ fn cmpIX(dst:(*u64), len:ux, x:(u64), v:(u1)) : void = {
def eqne{op} = same{op,__eq}|same{op,__ne}
-def pathAS{dst, len, T, op, x & issigned{T}} = {
- def R{f & eqne{op}} = {
+def pathAS{dst, len, T, op, x if issigned{T}} = {
+ def R{f if eqne{op}} = {
if (rare{floor{f}!=f}) fillbits{dst, len, op{0,1}, x} # also includes check for NaN/sNaN
ftrunc{i64,f}
}
- def R{f & same{op,__lt}|same{op,__ge}} = ftrunc{i64,ceil{f}}
- def R{f & same{op,__gt}|same{op,__le}} = ftrunc{i64,floor{f}}
+ def R{f if same{op,__lt} or same{op,__ge}} = ftrunc{i64,ceil{f}}
+ def R{f if same{op,__gt} or same{op,__le}} = ftrunc{i64,floor{f}}
xf:f64 = interp_f64{x}
xi64:i64 = R{xf}
@@ -40,7 +40,7 @@ def pathAS{dst, len, T, op, x & issigned{T}} = {
xT
}
-def pathAS{dst, len, T, op, x & T==f64} = {
+def pathAS{dst, len, T, op, x if T==f64} = {
if (rare{~q_f64{x}}) {
if (~eqne{op}) if (~q_chr{x}) cmp_err{x}
fillbits{dst, len, op{0,1}, x}
@@ -48,7 +48,7 @@ def pathAS{dst, len, T, op, x & T==f64} = {
from_B{T,x}
}
-def pathAS{dst, len, T, op, x & isunsigned{T}} = {
+def pathAS{dst, len, T, op, x if isunsigned{T}} = {
if (rare{~q_chr{x}}) {
if (~eqne{op}) if (~q_f64{x}) cmp_err{x}
fillbits{dst, len, op{1,0}, x}
@@ -66,12 +66,12 @@ def any2bit{VT, unr, op0, wS, wV, xS, xV, dst:(*u64), len:(ux)} = {
def T = eltype{VT}
def op = match (op0) {
- {_ & ~hasarch{'X86_64'} | hasarch{'AVX512F'}} => op0
- {(__le) & issigned{T}} => __gt
- {(__ge) & issigned{T}} => __lt
- {(__lt) & isunsigned{T}} => __ge
- {(__gt) & isunsigned{T}} => __le
- {(__ne) & isint{T}} => __eq
+ {_ if not hasarch{'X86_64'} or hasarch{'AVX512F'}} => op0
+ {(__le) if issigned{T}} => __gt
+ {(__ge) if issigned{T}} => __lt
+ {(__lt) if isunsigned{T}} => __ge
+ {(__gt) if isunsigned{T}} => __le
+ {(__ne) if isint{T}} => __eq
{_} => op0
}
def mask = if (same{op0, op}) homMask else ({...x} => ~homMask{...x})
diff --git a/src/singeli/src/count.singeli b/src/singeli/src/count.singeli
index 1a8ed9ad..624de946 100644
--- a/src/singeli/src/count.singeli
+++ b/src/singeli/src/count.singeli
@@ -4,7 +4,7 @@ include './vecfold'
if_inline (hasarch{'SSE2'}) {
fn sum_vec{T}(v:T) = vfold{+, fold{+, unpackQ{v, T**0}}}
- def fold_addw{v:T & eltype{T}==i8} = sum_vec{T}(v)
+ def fold_addw{v:T if eltype{T}==i8} = sum_vec{T}(v)
}
def inc{ptr, ind, v} = store{ptr, ind, v + load{ptr, ind}}
diff --git a/src/singeli/src/dyarith.singeli b/src/singeli/src/dyarith.singeli
index e61074b7..768b81c4 100644
--- a/src/singeli/src/dyarith.singeli
+++ b/src/singeli/src/dyarith.singeli
@@ -6,15 +6,15 @@ include './mask'
include 'util/tup'
-def rootty{T & isprim{T}} = T
-def rootty{T & isvec{T}} = eltype{T}
+def rootty{T if isprim{T}} = T
+def rootty{T if isvec{T}} = eltype{T}
def is_s{X} = issigned{rootty{X}}
def is_u{X} = isunsigned{rootty{X}}
def ty_sc{O, R} = R # keep floats as-is
-def ty_sc{O, R & is_s{O} & is_u{R}} = ty_s{R}
-def ty_sc{O, R & is_u{O} & is_s{R}} = ty_u{R}
+def ty_sc{O, R if is_s{O} and is_u{R}} = ty_s{R}
+def ty_sc{O, R if is_u{O} and is_s{R}} = ty_u{R}
def bqn_or{a, b} = (a+b)-(a*b)
@@ -22,18 +22,18 @@ def bqn_or{a, b} = (a+b)-(a*b)
# + & -
def arithChk1{F==__add, M, w:T, x:T, r:T} = tup{'topAny', M{(w^r) & (x^r)}}
def arithChk1{F==__sub, M, w:T, x:T, r:T} = tup{'topAny', M{(w^x) & (w^r)}}
-def arithChk1{F==__add, M, w:T, x:T, r:T & isvec{T} & tern{hasarch{'X86_64'}, elwidth{T}<=16, 1}} = tup{'anyne', adds{w,x}, r}
-def arithChk1{F==__sub, M, w:T, x:T, r:T & isvec{T} & tern{hasarch{'X86_64'}, elwidth{T}<=16, 1}} = tup{'anyne', subs{w,x}, r}
+def arithChk1{F==__add, M, w:T, x:T, r:T if isvec{T} and tern{hasarch{'X86_64'}, elwidth{T}<=16, 1}} = tup{'anyne', adds{w,x}, r}
+def arithChk1{F==__sub, M, w:T, x:T, r:T if isvec{T} and tern{hasarch{'X86_64'}, elwidth{T}<=16, 1}} = tup{'anyne', subs{w,x}, r}
-def arithChk2{F, M, w:T, x:T & is_s{T} & (same{F,__add} | same{F,__sub})} = {
+def arithChk2{F, M, w:T, x:T if is_s{T} and (same{F,__add} or same{F,__sub})} = {
r:= F{w,x}
tup{r, arithChk1{F, M, w, x, r}}
}
# ×
-def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i8==eltype{T} & hasarch{'X86_64'}} = {
+def arithChk2{F, M, w:T, x:T if same{F,__mul} and isvec{T} and i8==eltype{T} and hasarch{'X86_64'}} = {
def wp = unpackQ{w, T ~~ (T**0 > w)}
def xp = unpackQ{x, T ~~ (T**0 > x)}
def rp = each{__mul, wp, xp}
@@ -46,12 +46,12 @@ def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i8==eltype{T} & hasarc
tup{packQ{rp}, tup{'~andAllZero', RU~~tree_fold{|, bad}, RU**0xff80}}
}
}
-def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i16==eltype{T} & hasarch{'X86_64'}} = {
+def arithChk2{F, M, w:T, x:T if same{F,__mul} and isvec{T} and i16==eltype{T} and hasarch{'X86_64'}} = {
rl:= __mul{w,x}
rh:= mulh{w,x}
tup{rl, tup{'anyne', rh, rl>>15}}
}
-def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i32==eltype{T} & hasarch{'X86_64'}} = {
+def arithChk2{F, M, w:T, x:T if same{F,__mul} and isvec{T} and i32==eltype{T} and hasarch{'X86_64'}} = {
max:= re_el{f32, (ty_u{T})**0x4efffffe}
def cf32{x:X} = emit{re_el{f32,X}, tern{T==[8]i32, '_mm256_cvtepi32_ps', '_mm_cvtepi32_ps'}, x}
f32mul:= cf32{w} * cf32{x}
@@ -68,7 +68,7 @@ def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i32==eltype{T} & hasar
# tup{packQQ{each{{v} => v & T2**0xFFFFFFFF, rp}}, tup{'homAny', tree_fold{|,bad}}} this doesn't use M
}
-def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & hasarch{'AARCH64'}} = {
+def arithChk2{F, M, w:T, x:T if same{F,__mul} and isvec{T} and hasarch{'AARCH64'}} = {
def r12 = mulw{w, x}
rl:= packLo{r12}
rh:= packHi{r12}
@@ -82,24 +82,24 @@ def runner{u, R, F} = {
def run{F, M, w, x} = { show{'todo', c, R, F, w, x}; emit{void,'__builtin_abort'}; w }
- def run{F, M, w:T, x:T & c & R!=u32} = {
+ def run{F, M, w:T, x:T if c and R!=u32} = {
arithChk2{F, M, w, x}
}
- def run{F, M, w, x & u} = tup{F{w, x}, tup{'none'}} # trivial base implementation
+ def run{F, M, w, x if u} = tup{F{w, x}, tup{'none'}} # trivial base implementation
def toggleTop{x:X} = x ^ X**(1<<(elwidth{X}-1))
- def run{F==__sub, M, w:VU, x:VU & c & is_u{VU}} = { # 'b'-'a'
+ def run{F==__sub, M, w:VU, x:VU if c and is_u{VU}} = { # 'b'-'a'
def VS = ty_s{VU}
run{F, M, VS~~toggleTop{w}, VS~~toggleTop{x}}
}
- def run{F, M, w:VU, x:VS & c & is_u{VU} & is_s{VS}} = { # 'a'+3, 'a'-3
+ def run{F, M, w:VU, x:VS if c and is_u{VU} and is_s{VS}} = { # 'a'+3, 'a'-3
def {res, ok} = run{F, M, VS~~toggleTop{w}, x}
tup{toggleTop{VU~~res}, ok}
}
- def run{F==__add, M, w:VS, x:VU & c & is_s{VS} & is_u{VU}} = run{F, M, x, w} # 3+'a' → 'a'+3
+ def run{F==__add, M, w:VS, x:VU if c and is_s{VS} and is_u{VU}} = run{F, M, x, w} # 3+'a' → 'a'+3
- def run{F, M, w:VW, x:VX & c & R==u32 & (same{F,__add} | same{F,__sub})} = { # 'a'+1, 'a'-1
+ def run{F, M, w:VW, x:VX if c and R==u32 and (same{F,__add} | same{F,__sub})} = { # 'a'+1, 'a'-1
r:= F{ty_u{w}, ty_u{x}}
tup{re_el{R, VW}~~r, tup{'homAny', M{r > type{r}**1114111}}}
}
@@ -111,7 +111,7 @@ def runChecks_any{F, vals} = { F{tree_fold{|, each{select{.,1}, vals}}} }
def runChecks{type=='homAny', vals, M} = runChecks_any{homAny, vals}
def runChecks{type=='topAny', vals, M} = runChecks_any{topAny, vals}
def runChecks{type=='none', vals, M} = 0
-def runChecks{type=='~andAllZero', vals, M & ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}}
+def runChecks{type=='~andAllZero', vals, M if ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}}
def runChecks{type=='anyne', vals, M} = {
def i{vals} = {
def {_,xs,ys} = flip{vals}
@@ -182,7 +182,7 @@ fn arithSAf{vw, mode, F, swap, W, X, R}(r:*void, w:u64, x:*void, len:u64) : u64
def run = runner{(R==f64) | (mode>=2), R, F}
def getW{v} = trunc{W, v}
- def getW{v & W==f64} = interp_f64{v}
+ def getW{v if W==f64} = interp_f64{v}
cw:= ty_sc{W, TY}**getW{w}
def unr = tern{mode>=2, 2, 1} # same as in arithAAimpl
diff --git a/src/singeli/src/equal.singeli b/src/singeli/src/equal.singeli
index ade621f3..b80eb02d 100644
--- a/src/singeli/src/equal.singeli
+++ b/src/singeli/src/equal.singeli
@@ -39,9 +39,9 @@ fn equal{W, X}(w:*void, x:*void, l:u64, d:u64) : u1 = {
} else { # bitarr ≡ i8/i16/i32arr
def T = [bulk]X
def sh{c} = c << (width{X}-1)
- def sh{c & X==u8} = T ~~ (re_el{u16,c}<<7)
- def mask{x:X & hasarch{'X86_64'}} = topMask{x}
- def mask{x:X & hasarch{'AARCH64'}} = homMask{andnz{x, ~T**0}}
+ def sh{c if X==u8} = T ~~ (re_el{u16,c}<<7)
+ def mask{x:X if hasarch{'X86_64'}} = topMask{x}
+ def mask{x:X if hasarch{'AARCH64'}} = homMask{andnz{x, ~T**0}}
# TODO compare with doing the comparison in vector registers
badBits:= T ** ~(X~~1)
diff --git a/src/singeli/src/f64.singeli b/src/singeli/src/f64.singeli
index 00dda936..62107337 100644
--- a/src/singeli/src/f64.singeli
+++ b/src/singeli/src/f64.singeli
@@ -6,10 +6,10 @@ def NaN = 0.0/0.0
def isNaN{x:(f64)} = x!=x
def qNaN{x:(u64)} = (x<<1) == (cast{u64, 0x8ff8} << 49)
-def ftrunc{T, x:(f64) & i8==T} = emit{i8, '', x}
-def ftrunc{T, x:(f64) & i16==T} = emit{i16, '', x}
-def ftrunc{T, x:(f64) & i32==T} = emit{i32, '', x} # maybe explicitly use _mm_cvtsd_si32?
-def ftrunc{T, x:(f64) & i64==T} = emit{i64, '', x}
+def ftrunc{T== i8, x:(f64)} = emit{T, '', x}
+def ftrunc{T==i16, x:(f64)} = emit{T, '', x}
+def ftrunc{T==i32, x:(f64)} = emit{T, '', x} # maybe explicitly use _mm_cvtsd_si32?
+def ftrunc{T==i64, x:(f64)} = emit{T, '', x}
def fext{x} = emit{f64, '', x}
def interp_f64{x:(u64)} = emit{f64, 'interp_f64', x}
diff --git a/src/singeli/src/fold.singeli b/src/singeli/src/fold.singeli
index bb132d4e..1f262be5 100644
--- a/src/singeli/src/fold.singeli
+++ b/src/singeli/src/fold.singeli
@@ -3,8 +3,8 @@ include './mask'
def opsh64{op}{v:([4]f64), perm} = op{v, shuf{[4]u64, v, perm}}
def opsh32{op}{v:([2]f64), perm} = op{v, shuf{[4]u32, v, perm}}
-def mix{op, v:([4]f64) & hasarch{'AVX'}} = { def sh=opsh64{op}; sh{sh{v, 4b2301}, 4b1032} }
-def mix{op, v:([2]f64) & hasarch{'X86_64'}} = opsh32{op}{v, 4b1032}
+def mix{op, v:([4]f64) if hasarch{'AVX'}} = { def sh=opsh64{op}; sh{sh{v, 4b2301}, 4b1032} }
+def mix{op, v:([2]f64) if hasarch{'X86_64'}} = opsh32{op}{v, 4b1032}
def reduce_pairwise{op, plog, x:*T, len, init:T} = {
# Pairwise combination to shorten dependency chains
diff --git a/src/singeli/src/hashtab.singeli b/src/singeli/src/hashtab.singeli
index 3a2556fd..502fc1a2 100644
--- a/src/singeli/src/hashtab.singeli
+++ b/src/singeli/src/hashtab.singeli
@@ -29,7 +29,7 @@ def hash_val{x0:(u64)} = {
}
# CRC32
if_inline (hasarch{'SSE4.2'}) require{'x86intrin.h'}
-def hash_val{x:(u32) & hasarch{'SSE4.2'}} = {
+def hash_val{x:(u32) if hasarch{'SSE4.2'}} = {
emit{u32, '_mm_crc32_u32', 0x973afb51, x}
}
diff --git a/src/singeli/src/mask.singeli b/src/singeli/src/mask.singeli
index 598bb439..11433d84 100644
--- a/src/singeli/src/mask.singeli
+++ b/src/singeli/src/mask.singeli
@@ -3,25 +3,25 @@ local def maskInit1{w} = {
merge{(w/8-1)**255, (1<<x)-1, (w/8)**0}
}, iota{8}}}
}
-mask256_1:*u8 = maskInit1{256}; def maskOfBit{T,n & width{T}==256} = load{*[32]u8 ~~ (mask256_1 + (n>>3)^31 + 64*(n&7))}
-mask128_1:*u8 = maskInit1{128}; def maskOfBit{T,n & width{T}==128} = load{*[16]u8 ~~ (mask128_1 + (n>>3)^15 + 32*(n&7))}
+mask256_1:*u8 = maskInit1{256}; def maskOfBit{T,n if width{T}==256} = load{*[32]u8 ~~ (mask256_1 + (n>>3)^31 + 64*(n&7))}
+mask128_1:*u8 = maskInit1{128}; def maskOfBit{T,n if width{T}==128} = load{*[16]u8 ~~ (mask128_1 + (n>>3)^15 + 32*(n&7))}
mask256:*i64 = merge{4 ** -1, 4 ** 0}
local def maskOfImpl{T, n, w} = load{*ty_u{T} ~~ (*u8~~mask256 + 32 - n*(elwidth{T}/8))}
# get homogeneous mask of first n items; 0 ≤ n ≤ vcount{T}
-def maskOf{T,n & width{T}==256} = maskOfImpl{T, n, 256}
-def maskOf{T,n & width{T}==128} = maskOfImpl{T, n, 128}
-def maskOf{T,n & width{T}== 64} = maskOfImpl{T, n, 64}
-
-def anyne{x:T, y:T, M & M{0}==0 & isvec{T}} = ~homAll{x==y}
-def anyne{x:T, y:T, M & M{0}==1 & isvec{T}} = homAny{M{x!=y}}
-def anyne{x:T, y:T, M & M{0}==0 & anyInt{x}} = x!=y
-def anyne{x:T, y:T, M & M{0}==1 & anyInt{x}} = M{x^y} != 0
+def maskOf{T,n if width{T}==256} = maskOfImpl{T, n, 256}
+def maskOf{T,n if width{T}==128} = maskOfImpl{T, n, 128}
+def maskOf{T,n if width{T}== 64} = maskOfImpl{T, n, 64}
+
+def anyne{x:T, y:T, M if M{0}==0 and isvec{T}} = ~homAll{x==y}
+def anyne{x:T, y:T, M if M{0}==1 and isvec{T}} = homAny{M{x!=y}}
+def anyne{x:T, y:T, M if M{0}==0 and anyInt{x}} = x!=y
+def anyne{x:T, y:T, M if M{0}==1 and anyInt{x}} = M{x^y} != 0
def anyneBit{x:T, y:T, M} = ~M{x^y, 'all bits zeroes'}
-def anynePositive{x:T, y:T, M & M{0}==0} = anyne{x, y, M}
-def anynePositive{x:T, y:T, M & M{0}==1 & isvec{T}} = {
+def anynePositive{x:T, y:T, M if M{0}==0} = anyne{x, y, M}
+def anynePositive{x:T, y:T, M if M{0}==1 and isvec{T}} = {
def {n,m} = homMaskX{x==y}
def E = tern{type{m}==u64, u64, u32}
(promote{E,~m} << (width{E}-M{'count'}*n)) != 0
@@ -30,8 +30,8 @@ def anynePositive{x:T, y:T, M & M{0}==1 & isvec{T}} = {
def maskNone{x} = x
def maskNone{x, mode=='all bits zeroes'} = andAllZero{x, x}
def maskAfter{n} = {
- def mask{x:X & isvec{X}} = x & (X~~maskOf{X,n})
- def mask{x:X & anyInt{x}} = x & ((1<<n) - 1)
+ def mask{x:X if isvec{X}} = x & (X~~maskOf{X,n})
+ def mask{x:X if anyInt{x}} = x & ((1<<n) - 1)
def mask{x:X, mode=='all bits zeroes'} = andAllZero{x, X~~maskOfBit{X,n}}
def mask{X, mode=='to sign bits'} = maskOf{X,n}
def mask{X, mode=='to homogeneous bits'} = maskOf{X,n}
@@ -42,7 +42,7 @@ def maskAfter{n} = {
-def loadLowBatch{T, ptr:P, w, n & eltype{P}==eltype{T}} = loadLow{*T ~~ (ptr + n*(w/elwidth{P})), w}
+def loadLowBatch{T, ptr:P, w, n if eltype{P}==eltype{T}} = loadLow{*T ~~ (ptr + n*(w/elwidth{P})), w}
# store vcount{T} items into the n'th batch of ptr elements, compressing the items if needed; masked by M
def storeBatch{ptr:P, n, x:T, M} = {
@@ -63,8 +63,8 @@ def loadBatch{ptr:P, n, T} = {
widen{T, loadLow{*re_el{E0, T} ~~ rpos, vcount{T}*width{E0}}}
}
-def loadBatch {ptr:P, ns, T & istup{ns}} = each{{n } => loadBatch {ptr, n, T }, ns}
-def storeBatch{ptr:P, ns, xs, M & istup{ns}} = each{{n,x} => storeBatch{ptr, n, x, M}, ns, xs}
+def loadBatch {ptr:P, ns, T if istup{ns}} = each{{n } => loadBatch {ptr, n, T }, ns}
+def storeBatch{ptr:P, ns, xs, M if istup{ns}} = each{{n,x} => storeBatch{ptr, n, x, M}, ns, xs}
@@ -74,7 +74,7 @@ def hCast{T,p:*T} = p
def hCast{T,p:(*void)} = *T~~p
def mlExec{i, iter, vars0, bulk, M} = {
- def vproc{p:P & isptr{P}} = p
+ def vproc{p:P if isptr{P}} = p
def vproc{('m')} = tptr{{_}=>M, '!'}
def vproc{{T,p:P}} = tptr{{i} => loadBatch{p, i, T}, {i,x} => storeBatch{p, i, x, M}}
diff --git a/src/singeli/src/neon.singeli b/src/singeli/src/neon.singeli
index dc773fc8..70933be5 100644
--- a/src/singeli/src/neon.singeli
+++ b/src/singeli/src/neon.singeli
@@ -1,157 +1,157 @@
def nvec{T} = 0
-def nvec{T & isvec{T}} = (width{T}==64) | (width{T}==128)
+def nvec{T if isvec{T}} = (width{T}==64) | (width{T}==128)
def nvec{T,w} = 0
-def nvec{T,w & nvec{T}} = elwidth{T}==w
+def nvec{T,w if nvec{T}} = elwidth{T}==w
def nveci = genchk{nvec, isint}
def nvecs = genchk{nvec, issigned}
def nvecu = genchk{nvec, isunsigned}
def nvecf = genchk{nvec, isfloat}
-def reinterpret{T, v & same{'pointer',typekind{T}} & ktup{v}} = { tmp:T=v }
+def reinterpret{T, v if same{'pointer',typekind{T}} and ktup{v}} = { tmp:T=v }
def nty{T} = {
def q = quality{T}
merge{if (q=='i') 's' else q, fmtnat{width{T}}}
}
-def nty{T & isvec{T}} = nty{eltype{T}}
-def ntyp{S, ...S2, T & w128{T}} = merge{S, 'q', ...S2, '_', nty{T}}
-def ntyp{S, ...S2, T & w64{T}} = merge{S, ...S2, '_', nty{T}}
+def nty{T if isvec{T}} = nty{eltype{T}}
+def ntyp{S, ...S2, T if w128{T}} = merge{S, 'q', ...S2, '_', nty{T}}
+def ntyp{S, ...S2, T if w64{T}} = merge{S, ...S2, '_', nty{T}}
def ntyp0{S, T} = merge{S, '_', nty{T}}
-def addwLo{a:T,b:T & w64i{T}} = emit{el_d{T}, ntyp{'vaddl', T}, a, b}
-def subwLo{a:T,b:T & w64i{T}} = emit{el_d{T}, ntyp{'vsubl', T}, a, b}
-def mulwLo{a:T,b:T & w64i{T}} = emit{el_d{T}, ntyp{'vmull', T}, a, b}
-def mulwHi{a:T,b:T & w128i{T}} = emit{el_m{T}, ntyp0{'vmull_high', T}, a, b}
-def mulw {a:T,b:T & w128{T}} = tup{mulwLo{half{a,0}, half{b,0}}, mulwHi{a,b}}
-
-def shrn{a:T, s & w128i{T} & elwidth{T}>8} = { def H=el_h{T}; emit{H, ntyp0{'vshrn_n', T}, a, s} } # a>>s, narrowed
-def shrm{a:T, s, d:T & nvecu{T}} = emit{T, ntyp{'vsri', '_n', T}, d, a, s} # (a>>s) | (d & (mask of new zeroes))
-def shlm{a:T, s, d:T & nvecu{T}} = emit{T, ntyp{'vsli', '_n', T}, d, a, s} # (a<<s) | (d & (mask of new zeroes))
-
-def bitBlend{f:T, t:T, m:M & nvec{T} & nvecu{M,elwidth{T}} & width{T}==width{M}} = emit{T, ntyp{'vbsl', T}, m, t, f}
-def homBlend{f:T, t:T, m:M & nvec{M}} = bitBlend{f, t, m}
-
-def addpw { x:T & nveci{T} & elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening
-def addpwa{a:D, x:T & nveci{T} & elwidth{T}<=32 & D==el_m{T}} = emit{D, ntyp{'vpadal', T}, a, x} # add pairwise widening + accumulate
-def mla{a:T, x:T, y:T & nvec{T}} = emit{T, ntyp{'vmla', T}, a, x, y} # a + x*y
-def mls{a:T, x:T, y:T & nvec{T}} = emit{T, ntyp{'vmls', T}, a, x, y} # a - x*y
-def rbit{x:T & nvecu{T,8}} = emit{T, ntyp{'vrbit', T}, x}
-def rev{w, x:T & w==elwidth{T}} = x
-def rev{w==16, x:T & elwidth{T}<16} = emit{T, ntyp{'vrev16', T}, x} # reverse the order of elements in each w-bit window
-def rev{w==32, x:T & elwidth{T}<32} = emit{T, ntyp{'vrev32', T}, x}
-def rev{w==64, x:T & elwidth{T}<64} = emit{T, ntyp{'vrev64', T}, x}
-def popc{x:T & nvecu{T,8}} = emit{T, ntyp{'vcnt', T}, x}
-def clz{x:T & nvecu{T} & elwidth{T}<=32} = emit{T, ntyp{'vclz', T}, x}
-def cls{x:T & nveci{T} & elwidth{T}<=32} = ty_u{T}~~emit{ty_s{T}, ntyp{'vcls', T}, x}
-
-def fold_add {a:T & nvec{T}} = emit{eltype{T}, ntyp{'vaddv', T}, a}
-def fold_addw{a:T & nveci{T}} = emit{w_d{eltype{T}}, ntyp{'vaddlv', T}, a}
-def fold_min {a:T & nvec{T} & ~nveci{T,64}} = emit{eltype{T}, ntyp{'vminv', T}, a}
-def fold_max {a:T & nvec{T} & ~nveci{T,64}} = emit{eltype{T}, ntyp{'vmaxv', T}, a}
-def vfold{F, x:T & nvec{T} & ~nveci{T,64} & same{F, min}} = fold_min{x}
-def vfold{F, x:T & nvec{T} & ~nveci{T,64} & same{F, max}} = fold_max{x}
-def vfold{F, x:T & nvec{T} & same{F, +}} = fold_add{x}
-
-def storeLow{ptr:P, w, x:T & nvec{T} & w<=64} = { def E=ty_u{w}; storeu{*E~~ptr, extract{re_el{E,T}~~x, 0}} }
-def storeLow{ptr:P, w, x:T & nvec{T} & w==width{T}} = store{*T~~ptr, 0, x}
-
-def loadLow{ptr:P, w & w<=64} = { # a broadcast load
+def addwLo{a:T,b:T if w64i{T}} = emit{el_d{T}, ntyp{'vaddl', T}, a, b}
+def subwLo{a:T,b:T if w64i{T}} = emit{el_d{T}, ntyp{'vsubl', T}, a, b}
+def mulwLo{a:T,b:T if w64i{T}} = emit{el_d{T}, ntyp{'vmull', T}, a, b}
+def mulwHi{a:T,b:T if w128i{T}} = emit{el_m{T}, ntyp0{'vmull_high', T}, a, b}
+def mulw {a:T,b:T if w128{T}} = tup{mulwLo{half{a,0}, half{b,0}}, mulwHi{a,b}}
+
+def shrn{a:T, s if w128i{T} and elwidth{T}>8} = { def H=el_h{T}; emit{H, ntyp0{'vshrn_n', T}, a, s} } # a>>s, narrowed
+def shrm{a:T, s, d:T if nvecu{T}} = emit{T, ntyp{'vsri', '_n', T}, d, a, s} # (a>>s) | (d & (mask of new zeroes))
+def shlm{a:T, s, d:T if nvecu{T}} = emit{T, ntyp{'vsli', '_n', T}, d, a, s} # (a<<s) | (d & (mask of new zeroes))
+
+def bitBlend{f:T, t:T, m:M if nvec{T} and nvecu{M,elwidth{T}} and width{T}==width{M}} = emit{T, ntyp{'vbsl', T}, m, t, f}
+def homBlend{f:T, t:T, m:M if nvec{M}} = bitBlend{f, t, m}
+
+def addpw { x:T if nveci{T} and elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening
+def addpwa{a:D, x:T if nveci{T} and elwidth{T}<=32 and D==el_m{T}} = emit{D, ntyp{'vpadal', T}, a, x} # add pairwise widening + accumulate
+def mla{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmla', T}, a, x, y} # a + x*y
+def mls{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmls', T}, a, x, y} # a - x*y
+def rbit{x:T if nvecu{T,8}} = emit{T, ntyp{'vrbit', T}, x}
+def rev{w, x:T if w==elwidth{T}} = x
+def rev{w==16, x:T if elwidth{T}<16} = emit{T, ntyp{'vrev16', T}, x} # reverse the order of elements in each w-bit window
+def rev{w==32, x:T if elwidth{T}<32} = emit{T, ntyp{'vrev32', T}, x}
+def rev{w==64, x:T if elwidth{T}<64} = emit{T, ntyp{'vrev64', T}, x}
+def popc{x:T if nvecu{T,8}} = emit{T, ntyp{'vcnt', T}, x}
+def clz{x:T if nvecu{T} and elwidth{T}<=32} = emit{T, ntyp{'vclz', T}, x}
+def cls{x:T if nveci{T} and elwidth{T}<=32} = ty_u{T}~~emit{ty_s{T}, ntyp{'vcls', T}, x}
+
+def fold_add {a:T if nvec{T}} = emit{eltype{T}, ntyp{'vaddv', T}, a}
+def fold_addw{a:T if nveci{T}} = emit{w_d{eltype{T}}, ntyp{'vaddlv', T}, a}
+def fold_min {a:T if nvec{T} and ~nveci{T,64}} = emit{eltype{T}, ntyp{'vminv', T}, a}
+def fold_max {a:T if nvec{T} and ~nveci{T,64}} = emit{eltype{T}, ntyp{'vmaxv', T}, a}
+def vfold{F, x:T if nvec{T} and ~nveci{T,64} and same{F, min}} = fold_min{x}
+def vfold{F, x:T if nvec{T} and ~nveci{T,64} and same{F, max}} = fold_max{x}
+def vfold{F, x:T if nvec{T} and same{F, +}} = fold_add{x}
+
+def storeLow{ptr:P, w, x:T if nvec{T} and w<=64} = { def E=ty_u{w}; storeu{*E~~ptr, extract{re_el{E,T}~~x, 0}} }
+def storeLow{ptr:P, w, x:T if nvec{T} and w==width{T}} = store{*T~~ptr, 0, x}
+
+def loadLow{ptr:P, w if w<=64} = { # a broadcast load
def T=eltype{P}
def L=re_el{ty_u{w}, T}
T ~~ emit{L, ntyp{'vld1', '_dup', L}, *ty_u{w}~~ptr}
}
-def loadLow{ptr:P, w & w==elwidth{P}} = load{ptr}
+def loadLow{ptr:P, w if w==elwidth{P}} = load{ptr}
-def undefPromote{T, x:X & w64{X} & w128{T} & eltype{T}==eltype{X}} = emit{T, ntyp{'vcombine', X}, x, x} # arm_neon.h doesn't actually provide a way to do this in a 0-instruction way. ¯\_(ツ)_/¯
-def half{x:T, n==0 & w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x}
-def half{x:T, n==1 & w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x}
-def pair{a:T, b:T & w64{T}} = emit{n_d{T}, ntyp0{'vcombine', T}, a, b}
-def copyLane{dst:D, di, src:S, si & w64{D} & nvec{S} & eltype{D}==eltype{S}} = emit{D, ntyp{'vcopy_lane', S}, dst, di, src, si}
-def copyLane{dst:D, di, src:S, si & w128{D} & nvec{S} & eltype{D}==eltype{S}} = emit{D, ntyp{'vcopyq_lane', S}, dst, di, src, si}
-def broadcastSel{x:T, i & nvec{T}} = emit{T, ntyp{'vdup', tern{w128{T},'_laneq','_lane'}, T}, x, i}
-def vshl{a:T, b:T, n & knum{n}} = emit{T, ntyp{'vext', T}, a, b, n}
+def undefPromote{T, x:X if w64{X} and w128{T} and eltype{T}==eltype{X}} = emit{T, ntyp{'vcombine', X}, x, x} # arm_neon.h doesn't actually provide a way to do this in a 0-instruction way. ¯\_(ツ)_/¯
+def half{x:T, n==0 if w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x}
+def half{x:T, n==1 if w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x}
+def pair{a:T, b:T if w64{T}} = emit{n_d{T}, ntyp0{'vcombine', T}, a, b}
+def copyLane{dst:D, di, src:S, si if w64{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopy_lane', S}, dst, di, src, si}
+def copyLane{dst:D, di, src:S, si if w128{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopyq_lane', S}, dst, di, src, si}
+def broadcastSel{x:T, i if nvec{T}} = emit{T, ntyp{'vdup', tern{w128{T},'_laneq','_lane'}, T}, x, i}
+def vshl{a:T, b:T, n if knum{n}} = emit{T, ntyp{'vext', T}, a, b, n}
-def zipLo{a:T, b:T & nvec{T}} = emit{T, ntyp{'vzip1', T}, a, b}
-def zipHi{a:T, b:T & nvec{T}} = emit{T, ntyp{'vzip2', T}, a, b}
-def zip{a:T, b:T & nvec{T}} = tup{zipLo{a,b}, zipHi{a,b}}
+def zipLo{a:T, b:T if nvec{T}} = emit{T, ntyp{'vzip1', T}, a, b}
+def zipHi{a:T, b:T if nvec{T}} = emit{T, ntyp{'vzip2', T}, a, b}
+def zip{a:T, b:T if nvec{T}} = tup{zipLo{a,b}, zipHi{a,b}}
-def packLo{x:T, y:T & nvec{T}} = { def H=el_s{T}; emit{H, ntyp{'vuzp1', H}, H~~x, H~~y} }
-def packHi{x:T, y:T & nvec{T}} = { def H=el_s{T}; emit{H, ntyp{'vuzp2', H}, H~~x, H~~y} }
+def packLo{x:T, y:T if nvec{T}} = { def H=el_s{T}; emit{H, ntyp{'vuzp1', H}, H~~x, H~~y} }
+def packHi{x:T, y:T if nvec{T}} = { def H=el_s{T}; emit{H, ntyp{'vuzp2', H}, H~~x, H~~y} }
def packLo{{x, y}} = packLo{x, y}
def packHi{{x, y}} = packHi{x, y}
-def trn1{x:T, y:T & nvec{T}} = emit{T, ntyp{'vtrn1', T}, x, y}
-def trn2{x:T, y:T & nvec{T}} = emit{T, ntyp{'vtrn2', T}, x, y}
+def trn1{x:T, y:T if nvec{T}} = emit{T, ntyp{'vtrn1', T}, x, y}
+def trn2{x:T, y:T if nvec{T}} = emit{T, ntyp{'vtrn2', T}, x, y}
-def sel{L, x:T, i:I & lvec{L,16,8} & w128{T} & nvec{I, 8}} = re_el{eltype{T}, emit{I, ntyp{'vqtbl1',I}, re_el{eltype{I},x}, ty_u{i}}}
+def sel{L, x:T, i:I if lvec{L,16,8} and w128{T} and nvec{I, 8}} = re_el{eltype{T}, emit{I, ntyp{'vqtbl1',I}, re_el{eltype{I},x}, ty_u{i}}}
local def eqqi{A, B} = isint{A} & (quality{A}==quality{B}) # equal quality integers
-def cvt{T==f64, x:X & nveci{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_f64', X}, x}
-def cvt{T==i64, x:X & nvecf{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_s64', X}, x}
-def cvt{T==u64, x:X & nvecf{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_u64', X}, x}
+def cvt{T==f64, x:X if nveci{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_f64', X}, x}
+def cvt{T==i64, x:X if nvecf{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_s64', X}, x}
+def cvt{T==u64, x:X if nvecf{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_u64', X}, x}
-def widen{T, x:X & w64{X} & eqqi{eltype{T},eltype{X}} & elwidth{T}==elwidth{X}*2} = emit{T, ntyp{'vmovl', X}, x}
-def widen{T, x:X & w64{X} & eqqi{eltype{T},eltype{X}} & elwidth{T}> elwidth{X}*2} = widen{T, widen{el_s{T}, x}}
-def widen{T, x:X & w64{X} & isfloat{eltype{T}}!=isfloat{eltype{X}} & elwidth{T}>elwidth{X}} = cvt{eltype{T}, widen{[vcount{T}](to_w{eltype{X},elwidth{T}}), x}}
-def widen{T, x:X & w128{X} & vcount{X}>vcount{T}} = widen{T, half{x,0}}
+def widen{T, x:X if w64{X} and eqqi{eltype{T},eltype{X}} and elwidth{T}==elwidth{X}*2} = emit{T, ntyp{'vmovl', X}, x}
+def widen{T, x:X if w64{X} and eqqi{eltype{T},eltype{X}} and elwidth{T}> elwidth{X}*2} = widen{T, widen{el_s{T}, x}}
+def widen{T, x:X if w64{X} and isfloat{eltype{T}}!=isfloat{eltype{X}} and elwidth{T}>elwidth{X}} = cvt{eltype{T}, widen{[vcount{T}](to_w{eltype{X},elwidth{T}}), x}}
+def widen{T, x:X if w128{X} and vcount{X}>vcount{T}} = widen{T, half{x,0}}
-def narrow{T, x:X & w128{X} & eqqi{T,eltype{X}} & width{T}*2< elwidth{X}} = narrow{T, undefPromote{el_s{X}, narrow{w_h{eltype{X}}, x}}}
-def narrow{T, x:X & w128{X} & eqqi{T,eltype{X}} & width{T}*2==elwidth{X}} = emit{el_h{X}, ntyp0{'vmovn', X}, x}
-def narrow{T, x:X & w128{X} & isfloat{T}!=isfloat{eltype{X}} & width{T}<elwidth{X}} = narrow{T, cvt{to_w{T, elwidth{X}}, x}}
+def narrow{T, x:X if w128{X} and eqqi{T,eltype{X}} and width{T}*2< elwidth{X}} = narrow{T, undefPromote{el_s{X}, narrow{w_h{eltype{X}}, x}}}
+def narrow{T, x:X if w128{X} and eqqi{T,eltype{X}} and width{T}*2==elwidth{X}} = emit{el_h{X}, ntyp0{'vmovn', X}, x}
+def narrow{T, x:X if w128{X} and isfloat{T}!=isfloat{eltype{X}} and width{T}<elwidth{X}} = narrow{T, cvt{to_w{T, elwidth{X}}, x}}
-def narrowUpper{lowRes:L, x:X & w64i{L} & w128{X} & el_d{L}==X} = emit{[vcount{L}*2](eltype{L}), ntyp0{'vmovn_high', X}, lowRes, x}
+def narrowUpper{lowRes:L, x:X if w64i{L} and w128{X} and el_d{L}==X} = emit{[vcount{L}*2](eltype{L}), ntyp0{'vmovn_high', X}, lowRes, x}
def narrowPair{a:T, b:T} = narrowUpper{narrow{w_h{eltype{T}}, a}, b}
-def narrowPair{a:T, b:T & isint{eltype{T}}} = packLo{a, b}
+def narrowPair{a:T, b:T if isint{eltype{T}}} = packLo{a, b}
-def widenUpper{x:T & w128i{T}} = emit{el_m{T}, ntyp0{'vmovl_high', T}, x}
-def widen{x:T & w128{T}} = tup{widen{el_m{T}, x}, widenUpper{x}}
+def widenUpper{x:T if w128i{T}} = emit{el_m{T}, ntyp0{'vmovl_high', T}, x}
+def widen{x:T if w128{T}} = tup{widen{el_m{T}, x}, widenUpper{x}}
def bitAny{x:T} = fold_max{re_el{u32, x}}!=0
def bitAll{x:T} = fold_min{re_el{u32, x}}==0xffff_ffff
-def topAny{x:T & nvec{T}} = fold_min{ty_s{x}}<0
-def topAll{x:T & nvec{T}} = fold_max{ty_s{x}}<0
-def homAny{x:T & nvec{T}} = bitAny{x}
-def homAll{x:T & nvec{T}} = bitAll{x}
+def topAny{x:T if nvec{T}} = fold_min{ty_s{x}}<0
+def topAll{x:T if nvec{T}} = fold_max{ty_s{x}}<0
+def homAny{x:T if nvec{T}} = bitAny{x}
+def homAll{x:T if nvec{T}} = bitAll{x}
-def homMask{x:T & nvecu{T} & elwidth{T}>=vcount{T}} = {
+def homMask{x:T if nvecu{T} and elwidth{T}>=vcount{T}} = {
truncBits{vcount{T}, fold_add{x & make{T, 1<<iota{vcount{T}}}}}
}
-def homMask{x:T & nvecu{T} & T==[16]u8} = {
+def homMask{x:T if nvecu{T} and T==[16]u8} = {
t:= [8]u16~~sel{[16]u8, x, make{[16]u8, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15}}
fold_add{t & make{[8]u16, (1<<iota{8})*0x0101}}
}
-def homMask{a:T,b:T & T==[16]u8} = {
+def homMask{a:T,b:T if T==[16]u8} = {
m:= make{[16]u8, 1<<(iota{16}&7)}
fold_add{addpw{addpw{addp{a&m, b&m}}}<<make{[4]u32,iota{4}*8}}
}
-def homMask{a:T,b:T,c:T,d:T & T==[16]u8} = {
+def homMask{a:T,b:T,c:T,d:T if T==[16]u8} = {
m:= make{[16]u8, 1<<(iota{16}&7)}
t1:= addp{a&m, b&m}
t2:= addp{c&m, d&m}
t3:= addp{t1, t2}
extract{[2]u64~~addp{t3,t3},0}
}
-def homMask{...as & length{as}>1 & elwidth{type{select{as,0}}}>=32} = homMask{...each{{i}=>narrowPair{select{as,i*2},select{as,i*2+1}}, iota{length{as}/2}}}
-def homMask{a:T,b:T & vcount{T}*2<=elwidth{T}} = {
+def homMask{...as if length{as}>1 and elwidth{type{select{as,0}}}>=32} = homMask{...each{{i}=>narrowPair{select{as,i*2},select{as,i*2+1}}, iota{length{as}/2}}}
+def homMask{a:T,b:T if vcount{T}*2<=elwidth{T}} = {
def n = vcount{T}
truncBits{n*2, fold_add{shrm{a,elwidth{T}-n,b} & make{T, (1<<iota{n}) | (1<<(iota{n}+n))}}}
}
-def andAllZero{x:T, y:T & nveci{T}} = ~bitAny{x&y}
+def andAllZero{x:T, y:T if nveci{T}} = ~bitAny{x&y}
-def homMaskX{a:T & eltype{T}!=u64} = {
+def homMaskX{a:T if eltype{T}!=u64} = {
def h = elwidth{T}/2
tup{h, truncBits{vcount{T}*h, extract{[1]u64~~shrn{el_m{T}~~a, h}, 0}}}
}
-def homMaskStoreF{p:P, m:M, v:T & nveci{M} & nvec{T,elwidth{M}} & eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}}
+def homMaskStoreF{p:P, m:M, v:T if nveci{M} and nvec{T,elwidth{M}} and eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}}
diff --git a/src/singeli/src/replicate.singeli b/src/singeli/src/replicate.singeli
index d4b8395d..bc1c6dea 100644
--- a/src/singeli/src/replicate.singeli
+++ b/src/singeli/src/replicate.singeli
@@ -22,7 +22,7 @@ def scan_core{upd, set, scan, rp:pT, wp:W, s:(usz)} = {
}
def indrep_by_sum{T, rp:(*T), wp, s:(usz), js, inc} = {
def scan{ptr, len} = @for (ptr over len) js=ptr+=js
- def scan{ptr, len & width{T}<=32} = {
+ def scan{ptr, len if width{T}<=32} = {
def scanfn = merge{'si_scan_pluswrap_u',fmtnat{width{T}}}
p := *ty_u{eltype{type{ptr}}}~~ptr
emit{void, scanfn, p, p, len, js}; js=load{ptr,len-1}
@@ -103,12 +103,12 @@ rcsh4_lkup:*i8 = shiftright{0, scan{+, fold{|, table{==, rcsh4_dom, iota{64}}}}}
def read_shuf_vecs{l, ellw:(u64), shp:P} = { # tuple of byte selectors in 1<<ellw
def V = eltype{P}
- def double{x:X & hasarch{'AVX2'}} = {
+ def double{x:X if hasarch{'AVX2'}} = {
s:=shuf{[4]u64, x, 4b3120}; s+=s
r:=each{bind{~~,[32]i8},unpackQ{s, s + X**1}}
r
}
- def double{x:X & hasarch{'AARCH64'}} = {
+ def double{x:X if hasarch{'AARCH64'}} = {
s:= x+x
zip{s, s + X**1}
}
@@ -193,8 +193,8 @@ fn rep_const_shuffle_partial4(wv:u64, ellw:u64, x:*i8, r:*i8, n:u64) : void = {
def step = vcount{V} # Bytes written
def wvb = wv << ellw
def hs = (h*step) / wvb # Actual step size in argument elements
- def shufbase{i & hasarch{'AVX2'}} = shuf{[4]u64, load{*V~~(x+i)}, 4b1010}
- def shufbase{i & hasarch{'AARCH64'}} = load{*V~~(x+i)}
+ def shufbase{i if hasarch{'AVX2'}} = shuf{[4]u64, load{*V~~(x+i)}, 4b1010}
+ def shufbase{i if hasarch{'AARCH64'}} = load{*V~~(x+i)}
def shufrun{a, s} = sel{[16]i8, a, s} # happens to be the same across AVX2 & NEON
i:u64 = 0
diff --git a/src/singeli/src/scan.singeli b/src/singeli/src/scan.singeli
index 78f7294c..b6719273 100644
--- a/src/singeli/src/scan.singeli
+++ b/src/singeli/src/scan.singeli
@@ -35,7 +35,7 @@ def scan_post{T, init, x:*T, r:*T, len:(u64), op, pre} = {
# Associative scan ?` if a?b?a = a?b = b?a, used for ⌊⌈
def scan_idem = scan_scal
-fn scan_idem{T, op & hasarch{'X86_64'}}(x:*T, r:*T, len:u64, init:T) : void = {
+fn scan_idem{T, op if hasarch{'X86_64'}}(x:*T, r:*T, len:u64, init:T) : void = {
scan_post{T, init, x, r, len, op, make_scan_idem{T, op}}
}
@@ -54,7 +54,7 @@ export{'si_scan_min_i32', scan_idem_id{i32, min}}; export{'si_scan_max_i32', sca
# Assumes identity is 0
def scan_assoc{op} = {
def shl0{v, k} = shl{[16]u8, v, k/8} # Lanewise
- def shl0{v:V, k==128 & hasarch{'AVX2'}} = {
+ def shl0{v:V, k==128 if hasarch{'AVX2'}} = {
# Broadcast end of lane 0 to entire lane 1
l:= V~~make{[8]i32,0,0,0,-1,0,0,0,0} & spread{v}
sel{[8]i32, l, make{[8]i32, 3*(3<iota{8})}}
@@ -65,7 +65,7 @@ def scan_plus = scan_assoc{+}
# Associative scan
def scan_assoc_0 = scan_scal
-fn scan_assoc_0{T, op & hasarch{'X86_64'}}(x:*T, r:*T, len:u64, init:T) : void = {
+fn scan_assoc_0{T, op if hasarch{'X86_64'}}(x:*T, r:*T, len:u64, init:T) : void = {
# Prefix op on entire AVX register
scan_post{T, init, x, r, len, op, scan_plus}
}
@@ -80,7 +80,7 @@ fn scan_neq{}(p:u64, x:*u64, r:*u64, nw:u64) : void = {
p = -(r>>63) # repeat sign bit
}
}
-fn clmul_scan_ne_any{& hasarch{'PCLMUL'}}(x:*void, r:*void, init:u64, words:u64, mark:u64) : void = {
+fn clmul_scan_ne_any{if hasarch{'PCLMUL'}}(x:*void, r:*void, init:u64, words:u64, mark:u64) : void = {
def V = [2]u64
m := V**mark
def xor64{a, i, carry} = { # carry is 64-bit broadcasted current total
@@ -101,10 +101,10 @@ fn clmul_scan_ne_any{& hasarch{'PCLMUL'}}(x:*void, r:*void, init:u64, words:u64,
storeLow{rv+e, 64, clmul{loadLow{xv+e, 64}, m, 0} ^ c}
}
}
-fn scan_neq{& hasarch{'PCLMUL'}}(init:u64, x:*u64, r:*u64, nw:u64) : void = {
+fn scan_neq{if hasarch{'PCLMUL'}}(init:u64, x:*u64, r:*u64, nw:u64) : void = {
clmul_scan_ne_any{}(*void~~x, *void~~r, init, nw, -(u64~~1))
}
-fn scan_neq{& hasarch{'AVX512BW', 'VPCLMULQDQ', 'GFNI'}}(init:u64, x:*u64, r:*u64, nw:u64) : void = {
+fn scan_neq{if hasarch{'AVX512BW', 'VPCLMULQDQ', 'GFNI'}}(init:u64, x:*u64, r:*u64, nw:u64) : void = {
def V = [8]u64
def sse{a} = make{[2]u64, a, 0}
carry := sse{init}
@@ -136,7 +136,7 @@ fn bcs{T}(x:*u64, r:*T, l:u64) : void = {
c:T = 0
@for (r over i to l) { c+= cast_i{T, bitp_get{x,i}}; r = c }
}
-fn bcs{T & hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = {
+fn bcs{T if hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = {
def U = ty_u{T}
def w = width{T}
def vl= 256 / w
@@ -202,7 +202,7 @@ def addChk{a:T, b:T} = {
def bad = emit{u1, '__builtin_add_overflow', a, b, mem}
tup{bad, load{mem}}
}
-def addChk{a:T, b:T & T==f64} = tup{0, a+b}
+def addChk{a:T, b:T if T==f64} = tup{0, a+b}
def widenFull{E, xs} = {
merge{...each{{x:X} => {
@@ -219,9 +219,9 @@ def widenFull{E, xs} = {
}, xs}}
}
-def floor{x & knum{x}} = x - (x%1)
-def maxabsval{T & issigned{T}} = -minvalue{T}
-def maxsafeint{T & issigned{T}} = maxvalue{T}
+def floor{x if knum{x}} = x - (x%1)
+def maxabsval{T if issigned{T}} = -minvalue{T}
+def maxsafeint{T if issigned{T}} = maxvalue{T}
def maxsafeint{T==f64} = 1<<53
fn plus_scan{X, R, O}(x:*X, c:R, r:*R, len:u64) : O = {
diff --git a/src/singeli/src/scan_common.singeli b/src/singeli/src/scan_common.singeli
index d4cf5ad1..0d861f2e 100644
--- a/src/singeli/src/scan_common.singeli
+++ b/src/singeli/src/scan_common.singeli
@@ -1,9 +1,9 @@
# Used by scan.singeli and bins.singeli
def sel8{v:V, t} = sel{[16]u8, v, make{re_el{i8,V}, t}}
-def sel8{v:V, t & w256{V} & istup{t} & length{t}==16} = sel8{v, merge{t,t}}
+def sel8{v:V, t if w256{V} and istup{t} and length{t}==16} = sel8{v, merge{t,t}}
-def shuf{T, v, n & istup{n}} = shuf{T, v, base{4,n}}
+def shuf{T, v, n if istup{n}} = shuf{T, v, base{4,n}}
local def rev{t} = { def l=length{t}; def j=l-1; select{j-t, j-range{l}} }
local def rev{up,t} = if (up) t else rev{t}
@@ -19,14 +19,14 @@ def spread{a:VT, ...up} = {
}
# Set all elements with the last element of the input
-def toLast{n:VT, up & hasarch{'X86_64'} & w128{VT}} = {
+def toLast{n:VT, up if hasarch{'X86_64'} and w128{VT}} = {
def l{v, w} = l{zip{up,v}, 2*w}
- def l{v, w & hasarch{'SSSE3'}} = sel8{v, up*(16-w/8)+iota{16}%(w/8)}
+ def l{v, w if hasarch{'SSSE3'}} = sel8{v, up*(16-w/8)+iota{16}%(w/8)}
def l{v, w==32} = shuf{[4]i32, v, 4**(up*3)}
def l{v, w==64} = shuf{[4]i32, v, (2*up) + tup{0,1,0,1}}
l{n, elwidth{VT}}
}
-def toLast{n:VT, up & hasarch{'AVX2'} & w256{VT}} = {
+def toLast{n:VT, up if hasarch{'AVX2'} and w256{VT}} = {
if (elwidth{VT}<=32) sel{[8]i32, spread{n,up}, [8]i32**(up*7)}
else shuf{[4]u64, n, 4**(up*3)}
}
@@ -51,17 +51,17 @@ def make_scan_idem{T, op, up} = {
def id = make{V, merger{c**get_id{op,T}, (width{V}/w-c)**0}}
(if (up) shl else shr){[16]u8, v, k/8} | id
}
- def shb{v, k & hasarch{'SSSE3'}} = sel8{v, shift{k/8,16}}
- def shb{v, k & k>=32} = shuf{[4]u32, v, shift{k/32,4}}
- def shb{v, k & k==128 & hasarch{'AVX2'}} = {
+ def shb{v, k if hasarch{'SSSE3'}} = sel8{v, shift{k/8,16}}
+ def shb{v, k if k>=32} = shuf{[4]u32, v, shift{k/32,4}}
+ def shb{v, k if k==128 and hasarch{'AVX2'}} = {
# After lanewise scan, broadcast end of lane 0 to entire lane 1
sel{[8]i32, spread{v,up}, make{[8]i32, rev{up,3*(3<iota{8})}}}
}
prefix_byshift{op, shb}
}
def make_scan_idem{T==f64, op, up} = {
- def sc{a:T & vcount{T}==2} = op{a, zip{~up,a}}
- def sc{a:T & hasarch{'AVX2'} & w256{T}} = {
+ def sc{a:T if vcount{T}==2} = op{a, zip{~up,a}}
+ def sc{a:T if hasarch{'AVX2'} and w256{T}} = {
def sh{s, a} = op{a, shuf{[4]u64, a, rev{up,s}}}
sh{tup{0,1,1,1},sh{tup{0,0,2,2},a}}
}
diff --git a/src/singeli/src/search.singeli b/src/singeli/src/search.singeli
index e4e7c1d3..b3f9f3de 100644
--- a/src/singeli/src/search.singeli
+++ b/src/singeli/src/search.singeli
@@ -79,7 +79,7 @@ def simd_bittab = hasarch{'SSSE3'}
def bittab_init{tab, z} = {
@for (t in *TI~~tab over 256) t = z
}
-def bittab_init{tab, z & simd_bittab} = {
+def bittab_init{tab, z if simd_bittab} = {
init:= VI**z
@unroll (t in *VI~~tab over 256/vcount{VI}) t = init
}
@@ -124,7 +124,7 @@ def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void)} = {
x+=k; rem-=k; ++r
}
}
-def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void) & simd_bittab} = {
+def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void) if simd_bittab} = {
def {bitsel, _} = bittab_selector{readbytes{*VI~~tab}}
def k = vcount{VI}
@for (x in *VI~~x0, r in *ty_u{k}~~r0 over cdiv{n,k}) r = bitsel{x}
@@ -295,11 +295,11 @@ def acc{unr, init:T} = {
else a1 = F{a1}
}
}
-def isI64{x:T & eltype{T}==f64 & hasarch{'AARCH64'}} = x == cvt{f64, cvt{i64, x}}
-def isI64{x:T & eltype{T}==f64 & hasarch{'SSE4.1'}} = (x==floor{x}) & (abs{x}<=T**(1<<53))
+def isI64{x:T if eltype{T}==f64 and hasarch{'AARCH64'}} = x == cvt{f64, cvt{i64, x}}
+def isI64{x:T if eltype{T}==f64 and hasarch{'SSE4.1'}} = (x==floor{x}) & (abs{x}<=T**(1<<53))
def maskBlend{b:T, x:T, M} = x
-def maskBlend{b:T, x:T, M & M{0}} = homBlend{b, x, M{T, 'to homogeneous bits'}}
+def maskBlend{b:T, x:T, M if M{0}} = homBlend{b, x, M{T, 'to homogeneous bits'}}
fn getRange{E}(x0:*void, res:*i64, n:u64) : u1 = {
assert{n>0}
@@ -600,7 +600,7 @@ fn hashtab{T, name}(rpi:*rty{name}, iv:*void, mi:usz, fv:*void, ni:usz, links:it
def try_vec_memb{..._} = {}
def try_vec_memb{T, hash, sz, sh, maxh, has_maxh, swap, rp, fp, n, done
- & hasarch{'SSE4.2'} & T==u32} = {
+ if hasarch{'SSE4.2'} and T==u32} = {
# Hash h wants bin h>>sh, so the offset for h in slot i is (in infite-precision ints)
# i-h>>sh = i+((1<<sh-1)-h)>>sh = (((i+1)<<sh-1)-h)>>sh
# We maintain io = (i+1)<<sh-1
diff --git a/src/singeli/src/select.singeli b/src/singeli/src/select.singeli
index 55fb35ab..983e4e2b 100644
--- a/src/singeli/src/select.singeli
+++ b/src/singeli/src/select.singeli
@@ -9,11 +9,11 @@ include 'util/tup'
# def:T - masked original content
# b:B - pointer to data to index; if width{B}<elwidth{T}, padding bytes are garbage read after wanted position
# idx - actual (unscaled) index list
-def gather{d:T, b:B, idx:([8]i32), M & w256{T,32}} = {
+def gather{d:T, b:B, idx:([8]i32), M if w256{T,32}} = {
if (M{0}) T ~~ emit{[8]i32, '_mm256_mask_i32gather_epi32', d, *void~~b, idx, M{T,'to sign bits'}, elwidth{B}/8}
else T ~~ emit{[8]i32, '_mm256_i32gather_epi32', *void~~b, idx, elwidth{B}/8}
}
-def gather{d:T, b:B, idx:([4]i32), M & w256{T,64}} = {
+def gather{d:T, b:B, idx:([4]i32), M if w256{T,64}} = {
if (M{0}) T ~~ emit{[4]i64, '_mm256_mask_i32gather_epi64', d, *void~~b, idx, M{T,'to sign bits'}, elwidth{B}/8}
else T ~~ emit{[4]i64, '_mm256_i32gather_epi64', *void~~b, idx, elwidth{B}/8}
}
@@ -70,7 +70,7 @@ def makeselx{VI, VD, nsel, xd, logv, cshuf} = {
def bb{c}{f, v} = (if (f) bblendn{c<v}; else bblend{(c&v)==v})
def bs{b, c, x} = cshuf{x, c}
- def bs{b, c, x & length{b}>0} = {
+ def bs{b, c, x if length{b}>0} = {
select{b,0}{each{bs{slice{b,1}, c, .}, x}}
}
diff --git a/src/singeli/src/slash.singeli b/src/singeli/src/slash.singeli
index b22c3b72..2f6cdc9b 100644
--- a/src/singeli/src/slash.singeli
+++ b/src/singeli/src/slash.singeli
@@ -7,8 +7,8 @@ if_inline (hasarch{'X86_64'}) {
include './mask'
include 'util/tup'
-def popcRand{x:T & isint{T} & width{T}==64} = emit{u8, 'rand_popc64', x} # under valgrind, return a random result in the range of possible ones
-def popcRand{x:T & isint{T} & width{T}<=32} = emit{u8, 'rand_popc64', x}
+def popcRand{x:T if isint{T} and width{T}==64} = emit{u8, 'rand_popc64', x} # under valgrind, return a random result in the range of possible ones
+def popcRand{x:T if isint{T} and width{T}<=32} = emit{u8, 'rand_popc64', x}
# Table from l bits to w-bit indices, shifted left by s, and G applied afterwards
def maketab{l,w,s,G} = {
@@ -64,7 +64,7 @@ def for_special_buffered{r, write_len}{vars,begin,sum,iter} = {
} else {
if (has_simd) {
def bufw = bufn * tw
- def vc = tern{hasarch{'X86_64'} & (bufw==128), 128, arch_defvw} / tw;
+ def vc = tern{hasarch{'X86_64'} and bufw==128, 128, arch_defvw} / tw;
def R = [vc]T
@unroll ((ov/vc)>>0) if (end-buf>vc) { store{*R~~r0, 0, load{*R~~buf}}; r0+=vc; buf+=vc }
assert{bufw % width{R} == 0} # to make sure the below doesn't read out-of-bounds on the stack
@@ -118,7 +118,7 @@ def topper{T, U, k, x} = {
itab_4_16:*u64 = maketab{4,16} # 16 elts, 128B
def thresh{c==0, T==i8 } = 32
def thresh{c==0, T==i16} = 16
-fn slash{c==0, T & T<=i16}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
+fn slash{c==0, T if T<=i16}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
def tw = width{T}
def n = 64/tw
def tab = if (tw==8) itab else itab_4_16
@@ -140,9 +140,9 @@ fn slash{c==0, T & T<=i16}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
# i16 /w & i32 x+/w; 8 elts/iter; 64 bit table input, expanded to 128 or 256 via topper
def simd128{} = hasarch{'X86_64'} | hasarch{'AARCH64'}
-def thresh{c==0, T==i16 & simd128{}} = 32
-def thresh{c==0, T==i32 & simd128{}} = 16
-fn slash{c==0, T & simd128{} & i16<=T & T<=i32}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
+def thresh{c==0, T==i16 if simd128{}} = 32
+def thresh{c==0, T==i32 if simd128{}} = 16
+fn slash{c==0, T if simd128{} and i16<=T and T<=i32}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
def I = [16]i8
j := I**(if (T==i16) 0 else cast_i{i8,x})
def {top, inctop} = topper{T, I, 8, x}
@@ -161,9 +161,9 @@ fn slash{c==0, T & simd128{} & i16<=T & T<=i32}(w:*u64, x:arg{c,T}, r:*T, l:u64,
# i8 & i16 w/x; 128 bits/iter; [16]i8 shuffle
def shufb128{} = hasarch{'SSSE3'} | hasarch{'AARCH64'}
-def thresh{c==1, T==i8 & shufb128{}} = 64
-def thresh{c==1, T==i16 & shufb128{}} = 32
-fn slash{c==1, T & T<=i16 & shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
+def thresh{c==1, T==i8 if shufb128{}} = 64
+def thresh{c==1, T==i16 if shufb128{}} = 32
+fn slash{c==1, T if T<=i16 and shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
def V = [16]i8
@for_special_buffered{r,8} (w in *u8~~wp over i to sum) {
ind := load{itab, w}
@@ -179,8 +179,8 @@ fn slash{c==1, T & T<=i16 & shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u6
# i32 w/x; 8 elts/iter into 2 steps; [16]i8 shuffle
i32tab:*u32 = maketab{4,8,2} # 16 elts, 64B
-def thresh{c==1, T==i32 & shufb128{}} = 8
-fn slash{c==1, T==i32 & shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
+def thresh{c==1, T==i32 if shufb128{}} = 8
+fn slash{c==1, T==i32 if shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
def V = [16]i8
expander := make{V, iota{16}>>2}
trail := make{V, tail{2,iota{16}}}
@@ -202,9 +202,9 @@ fn slash{c==1, T==i32 & shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) :
# i32 & i64 w/x & x+/w; 256 bits/step, 8 elts/iter; [8]i32 shuffle
i64tab:*u64 = maketab{4,16,1,{x}=>(1+x)*0x100 + x} # 16 elts, 128B
-def thresh{c, T==i32 & hasarch{'AVX2'}} = 32
-def thresh{c, T==i64 & hasarch{'AVX2'}} = 8
-fn slash{c, T & hasarch{'AVX2'} & T>=i32}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
+def thresh{c, T==i32 if hasarch{'AVX2'}} = 32
+def thresh{c, T==i64 if hasarch{'AVX2'}} = 8
+fn slash{c, T if hasarch{'AVX2'} and T>=i32}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
def tw = width{T}
def V = [8]u32
expander := make{[32]u8, merge{...each{tup{., ... 3**128}, iota{8}>>lb{tw/32}}}}
@@ -235,11 +235,11 @@ fn slash{c, T & hasarch{'AVX2'} & T>=i32}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:
}
# everything; 512 bits/iter; AVX-512 compress
-def thresh{c, T==i8 & hasarch{'AVX512VBMI2'}} = 256
-def thresh{c, T==i16 & hasarch{'AVX512VBMI2'}} = 128
-def thresh{c, T==i32 & hasarch{'AVX512F'}} = 64
-def thresh{c, T==i64 & hasarch{'AVX512F'}} = 16
-fn slash{c, T & hasarch{if (width{T}>=32) 'AVX512F' else 'AVX512VBMI2'}}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
+def thresh{c, T==i8 if hasarch{'AVX512VBMI2'}} = 256
+def thresh{c, T==i16 if hasarch{'AVX512VBMI2'}} = 128
+def thresh{c, T==i32 if hasarch{'AVX512F'}} = 64
+def thresh{c, T==i64 if hasarch{'AVX512F'}} = 16
+fn slash{c, T if hasarch{if (width{T}>=32) 'AVX512F' else 'AVX512VBMI2'}}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = {
def f = fmtnat
def wt = width{T}
def vl = 512/wt
@@ -283,7 +283,7 @@ def pext_popc{x:T, m:T} = {
# - z tells how many bits in the group are NOT used
# - x contains the bits, with z zeros above
def build{k==1} = tup{x&m, ~m}
- def build{k & k > 1} = {
+ def build{k if k > 1} = {
def h = k>>1 # Increase size from h to k
{x,z} := build{h}
def low_s = lowbits{w,k} # Low bit in each new group
@@ -326,12 +326,12 @@ def pext_popc{x:T, m:T} = {
pe := fold{|, x&s0, each{gr, g*slice{iota{k/g},1}}}
tup{pe, o>>(k-g)}
}
- def build{k==32 & hasarch{'AVX2'} & isvec{T}} = {
+ def build{k==32 if hasarch{'AVX2'} and isvec{T}} = {
def S = re_el{ty_u{k}, T}
def c{T,vs} = each{{v}=>T~~v, vs}
c{T, multi_shift{...c{S, build{8}}, 8, k, {s}=>S**s}}
}
- def build{k & ~isvec{T} & k > 8} = {
+ def build{k if not isvec{T} and k > 8} = {
multi_shift{...build{8}, 8, k, {s}=>s}
}
# Final result
@@ -339,9 +339,9 @@ def pext_popc{x:T, m:T} = {
tup{pe, scal{w} - z}
}
-def pext_width {& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2
-def thresh_bool{& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32
-def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = {
+def pext_width {if hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2
+def thresh_bool{if hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32
+def pext_popc{x0:V, m0:V if hasarch{'PCLMUL'} and V==[2]u64} = {
def clmul{a, b} = zipLo{...@collect (j to 2) clmul{a,b,j}}
m := m0
x := x0 & m
@@ -359,9 +359,9 @@ def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = {
tup{x, @collect (j to 2) popc{extract{m0,j}}}
}
-def pext_width {& fast_BMI2{}} = 1
-def thresh_bool{& fast_BMI2{}} = 512
-def pext_popc{x:T, m:T & fast_BMI2{} & T==u64} = tup{pext{x, m}, popc{m}}
+def pext_width {if fast_BMI2{}} = 1
+def thresh_bool{if fast_BMI2{}} = 512
+def pext_popc{x:T, m:T if fast_BMI2{} and T==u64} = tup{pext{x, m}, popc{m}}
fn compress_bool(w:*u64, x:*u64, r:*u64, n:u64) : void = {
cw:u64 = 0; # current word
@@ -375,7 +375,7 @@ fn compress_bool(w:*u64, x:*u64, r:*u64, n:u64) : void = {
}
ro = ro2%64
}
- def extract{t, i & istup{t}} = select{t,i}
+ def extract{t, i if istup{t}} = select{t,i}
def v = pext_width{}
if (v > 1) {
def V = [v]u64
diff --git a/src/singeli/src/squeeze.singeli b/src/singeli/src/squeeze.singeli
index a794c995..2d16a10f 100644
--- a/src/singeli/src/squeeze.singeli
+++ b/src/singeli/src/squeeze.singeli
@@ -7,24 +7,24 @@ include './vecfold'
def preserve_negative_zero = 0
# SSE2 versions avoid any 64-bit integer comparsions
-def anySNaN{M, x:T & eltype{T}==u64} = {
+def anySNaN{M, x:T if eltype{T}==u64} = {
homAny{inRangeLen{M{x}<<1, (0xFFE<<52)+2, (1<<52)-2}}
}
-def anySNaN{M, x:T & T==[2]u64 & hasarch{'X86_64'} & ~hasarch{'SSE4.2'}} = {
+def anySNaN{M, x:T if T==[2]u64 and hasarch{'X86_64'} and not hasarch{'SSE4.2'}} = {
topAny{M{andnot{unord{[2]f64~~x, [2]f64~~x}, [2]u64~~([4]u32**0xFFF8_0000 == ([4]u32~~x | [4]u32**0x8000_0000))}}}
}
-def anyNonChar{M, x:T & isvec{T} & eltype{T}==u64} = homAny{M{~inRangeLen{x, cbqn_c32Tag{}<<48, 1<<48}}}
-def anyNonChar{M, x:T & isvec{T} & hasarch{'X86_64'}} = {
+def anyNonChar{M, x:T if isvec{T} and eltype{T}==u64} = homAny{M{~inRangeLen{x, cbqn_c32Tag{}<<48, 1<<48}}}
+def anyNonChar{M, x:T if isvec{T} and hasarch{'X86_64'}} = {
def H = re_el{u32, T}
def ne = H~~x != H**cast_i{u32, cbqn_c32Tag{}<<16}
topAny{M{T~~ne}}
}
-def cvtNarrow{T, x:X & width{T}==elwidth{X}} = cvt{T, x}
-def cvtNarrow{T, x:X & width{T}< elwidth{X}} = narrow{T, x}
-def cvtWiden{T, x:X & elwidth{T}==elwidth{X}} = cvt{eltype{T}, x}
-def cvtWiden{T, x:X & elwidth{T}> elwidth{X}} = widen{T, x}
+def cvtNarrow{T, x:X if width{T}==elwidth{X}} = cvt{T, x}
+def cvtNarrow{T, x:X if width{T}< elwidth{X}} = narrow{T, x}
+def cvtWiden{T, x:X if elwidth{T}==elwidth{X}} = cvt{eltype{T}, x}
+def cvtWiden{T, x:X if elwidth{T}> elwidth{X}} = widen{T, x}
fn squeeze{vw, X, CHR, B}(x0:*void, len:ux) : u32 = {
assert{len>0}
@@ -36,7 +36,7 @@ fn squeeze{vw, X, CHR, B}(x0:*void, len:ux) : u32 = {
# fold with either Max or Bitwise Or, truncating/zero-extending to TE
def foldTotal{TE, x:T} = cast_i{TE, vfold{|, x}}
- def foldTotal{TE, x:T & hasarch{'AARCH64'}} = {
+ def foldTotal{TE, x:T if hasarch{'AARCH64'}} = {
if (elwidth{T}==64) {
if (width{TE}==64 and bulk==2) cast_i{TE, half{x,0} | half{x,1}}
else vfold{max, narrow{TE, x}}
diff --git a/src/singeli/src/sse.singeli b/src/singeli/src/sse.singeli
index a190880c..cc1a6309 100644
--- a/src/singeli/src/sse.singeli
+++ b/src/singeli/src/sse.singeli
@@ -1,32 +1,32 @@
### SSSE3 ###
-def sel{L, x:T, i:I & hasarch{'SSSE3'} & lvec{L,16,8} & w128{T} & w128i{I, 8}} = T ~~ emit{[16]u8, '_mm_shuffle_epi8', v2i{x}, i}
-def vshl{a:T, b:T, n & hasarch{'SSSE3'}} = T~~emit{[16]u8, '_mm_alignr_epi8', v2i{b}, v2i{a}, n*(elwidth{T}/8)}
+def sel{L, x:T, i:I if hasarch{'SSSE3'} and lvec{L,16,8} and w128{T} and w128i{I, 8}} = T ~~ emit{[16]u8, '_mm_shuffle_epi8', v2i{x}, i}
+def vshl{a:T, b:T, n if hasarch{'SSSE3'}} = T~~emit{[16]u8, '_mm_alignr_epi8', v2i{b}, v2i{a}, n*(elwidth{T}/8)}
### SSE4.1 ###
-def packs{a:T,b:T & hasarch{'SSE4.1'} & T==[4]u32} = emit{[ 8]u16, '_mm_packus_epi32', a, b}
-def andAllZero{x:T, y:T & hasarch{'SSE4.1'} & w128i{T}} = emit{u1, '_mm_testz_si128', x, y}
+def packs{a:T,b:T if hasarch{'SSE4.1'} and T==[4]u32} = emit{[ 8]u16, '_mm_packus_epi32', a, b}
+def andAllZero{x:T, y:T if hasarch{'SSE4.1'} and w128i{T}} = emit{u1, '_mm_testz_si128', x, y}
# conversion
-def widen{T==[8]u16, x:X & hasarch{'SSE4.1'} & X==[16]u8} = emit{T, '_mm_cvtepu8_epi16', x}; def widen{T==[8]i16, x:X & hasarch{'SSE4.1'} & X==[16]i8} = emit{T, '_mm_cvtepi8_epi16', x}
-def widen{T==[4]u32, x:X & hasarch{'SSE4.1'} & X==[16]u8} = emit{T, '_mm_cvtepu8_epi32', x}; def widen{T==[4]i32, x:X & hasarch{'SSE4.1'} & X==[16]i8} = emit{T, '_mm_cvtepi8_epi32', x}
-def widen{T==[4]u32, x:X & hasarch{'SSE4.1'} & X==[8]u16} = emit{T, '_mm_cvtepu16_epi32', x}; def widen{T==[4]i32, x:X & hasarch{'SSE4.1'} & X==[8]i16} = emit{T, '_mm_cvtepi16_epi32', x}
-def widen{T==[2]u64, x:X & hasarch{'SSE4.1'} & X==[16]u8} = emit{T, '_mm_cvtepu8_epi64', x}; def widen{T==[2]i64, x:X & hasarch{'SSE4.1'} & X==[16]i8} = emit{T, '_mm_cvtepi8_epi64', x}
-def widen{T==[2]u64, x:X & hasarch{'SSE4.1'} & X==[8]u16} = emit{T, '_mm_cvtepu16_epi64', x}; def widen{T==[2]i64, x:X & hasarch{'SSE4.1'} & X==[8]i16} = emit{T, '_mm_cvtepi16_epi64', x}
-def widen{T==[2]u64, x:X & hasarch{'SSE4.1'} & X==[4]u32} = emit{T, '_mm_cvtepu32_epi64', x}; def widen{T==[2]i64, x:X & hasarch{'SSE4.1'} & X==[4]i32} = emit{T, '_mm_cvtepi32_epi64', x}
-def widen{T==[2]f64, x:X & hasarch{'SSE4.1'} & w128i{X} & elwidth{X}<32} = widen{T, widen{[4]i32, x}}
+def widen{T==[8]u16, x:X if hasarch{'SSE4.1'} and X==[16]u8} = emit{T, '_mm_cvtepu8_epi16', x}; def widen{T==[8]i16, x:X if hasarch{'SSE4.1'} and X==[16]i8} = emit{T, '_mm_cvtepi8_epi16', x}
+def widen{T==[4]u32, x:X if hasarch{'SSE4.1'} and X==[16]u8} = emit{T, '_mm_cvtepu8_epi32', x}; def widen{T==[4]i32, x:X if hasarch{'SSE4.1'} and X==[16]i8} = emit{T, '_mm_cvtepi8_epi32', x}
+def widen{T==[4]u32, x:X if hasarch{'SSE4.1'} and X==[8]u16} = emit{T, '_mm_cvtepu16_epi32', x}; def widen{T==[4]i32, x:X if hasarch{'SSE4.1'} and X==[8]i16} = emit{T, '_mm_cvtepi16_epi32', x}
+def widen{T==[2]u64, x:X if hasarch{'SSE4.1'} and X==[16]u8} = emit{T, '_mm_cvtepu8_epi64', x}; def widen{T==[2]i64, x:X if hasarch{'SSE4.1'} and X==[16]i8} = emit{T, '_mm_cvtepi8_epi64', x}
+def widen{T==[2]u64, x:X if hasarch{'SSE4.1'} and X==[8]u16} = emit{T, '_mm_cvtepu16_epi64', x}; def widen{T==[2]i64, x:X if hasarch{'SSE4.1'} and X==[8]i16} = emit{T, '_mm_cvtepi16_epi64', x}
+def widen{T==[2]u64, x:X if hasarch{'SSE4.1'} and X==[4]u32} = emit{T, '_mm_cvtepu32_epi64', x}; def widen{T==[2]i64, x:X if hasarch{'SSE4.1'} and X==[4]i32} = emit{T, '_mm_cvtepi32_epi64', x}
+def widen{T==[2]f64, x:X if hasarch{'SSE4.1'} and w128i{X} and elwidth{X}<32} = widen{T, widen{[4]i32, x}}
-def narrow{T, x:X & hasarch{'SSE4.1'} & w128i{X,32} & T==i8} = sel{[16]u8, [16]i8~~x, make{[16]i8, 0,4,8,12, 0,0,0,0, 0,0,0,0, 0,0,0,0}}
-def narrow{T, x:X & hasarch{'SSE4.1'} & w128i{X,32} & T==i16} = sel{[16]u8, [8]i16~~x, make{[16]i8, 0,1,4,5, 8,9,12,13, 0,0,0,0, 0,0,0,0}}
+def narrow{T, x:X if hasarch{'SSE4.1'} and w128i{X,32} and T==i8} = sel{[16]u8, [16]i8~~x, make{[16]i8, 0,4,8,12, 0,0,0,0, 0,0,0,0, 0,0,0,0}}
+def narrow{T, x:X if hasarch{'SSE4.1'} and w128i{X,32} and T==i16} = sel{[16]u8, [8]i16~~x, make{[16]i8, 0,1,4,5, 8,9,12,13, 0,0,0,0, 0,0,0,0}}
# mask stuff
-def andAllZero{x:T, y:T & hasarch{'SSE4.1'} & w128i{T}} = emit{u1, '_mm_testz_si128', x, y}
-def topBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128i{M,32}} = T ~~ emit{[4]f32, '_mm_blendv_ps', v2f{f}, v2f{t}, v2f{m}}
-def topBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128i{M,64}} = T ~~ emit{[2]f64, '_mm_blendv_pd', v2d{f}, v2d{t}, v2d{m}}
-def topBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128i{M, 8}} = T ~~ emit{[16]i8, '_mm_blendv_epi8', v2i{f}, v2i{t}, v2i{m}}
+def andAllZero{x:T, y:T if hasarch{'SSE4.1'} and w128i{T}} = emit{u1, '_mm_testz_si128', x, y}
+def topBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128i{M,32}} = T ~~ emit{[4]f32, '_mm_blendv_ps', v2f{f}, v2f{t}, v2f{m}}
+def topBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128i{M,64}} = T ~~ emit{[2]f64, '_mm_blendv_pd', v2d{f}, v2d{t}, v2d{m}}
+def topBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128i{M, 8}} = T ~~ emit{[16]i8, '_mm_blendv_epi8', v2i{f}, v2i{t}, v2i{m}}
# assumes all bits are the same in each mask item
-def homBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128{M} & elwidth{M}!=16} = topBlend{f, t, m}
-def homBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128{M,16}} = topBlend{f, t, [16]i8~~m}
+def homBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128{M} and elwidth{M}!=16} = topBlend{f, t, m}
+def homBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128{M,16}} = topBlend{f, t, [16]i8~~m}
diff --git a/src/singeli/src/sse2.singeli b/src/singeli/src/sse2.singeli
index 8e82ad46..eff66f82 100644
--- a/src/singeli/src/sse2.singeli
+++ b/src/singeli/src/sse2.singeli
@@ -1,37 +1,37 @@
# compact casting for the annoying intrinsic type system
-def v2i{x:T & w128{T}} = if(isint{eltype{T}}) x else [16]u8 ~~ x
-def v2f{x:T & w128{T}} = [4]f32 ~~ x
-def v2d{x:T & w128{T}} = [2]f64 ~~ x
+def v2i{x:T if w128{T}} = if(isint{eltype{T}}) x else [16]u8 ~~ x
+def v2f{x:T if w128{T}} = [4]f32 ~~ x
+def v2d{x:T if w128{T}} = [2]f64 ~~ x
# load & store
-def loadLow{ptr:P, w & w128{eltype{P}} & w== 16} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si16', ptr}
-def loadLow{ptr:P, w & w128{eltype{P}} & w== 32} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si32', ptr}
-def loadLow{ptr:P, w & w128{eltype{P}} & w== 64} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si64', ptr}
-def loadLow{ptr:P, w & w128{eltype{P}} & w==128} = load{ptr}
+def loadLow{ptr:P, w if w128{eltype{P}} and w== 16} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si16', ptr}
+def loadLow{ptr:P, w if w128{eltype{P}} and w== 32} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si32', ptr}
+def loadLow{ptr:P, w if w128{eltype{P}} and w== 64} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si64', ptr}
+def loadLow{ptr:P, w if w128{eltype{P}} and w==128} = load{ptr}
-def storeLow{ptr:P, w, x:T & w128{T} & w== 16} = emit{void, '_mm_storeu_si16', ptr, v2i{x}}
-def storeLow{ptr:P, w, x:T & w128{T} & w== 32} = emit{void, '_mm_storeu_si32', ptr, v2i{x}}
-def storeLow{ptr:P, w, x:T & w128{T} & w== 64} = emit{void, '_mm_storeu_si64', ptr, v2i{x}}
-def storeLow{ptr:P, w, x:T & w128{T} & w==128} = store{*T~~ptr, 0, x}
+def storeLow{ptr:P, w, x:T if w128{T} and w== 16} = emit{void, '_mm_storeu_si16', ptr, v2i{x}}
+def storeLow{ptr:P, w, x:T if w128{T} and w== 32} = emit{void, '_mm_storeu_si32', ptr, v2i{x}}
+def storeLow{ptr:P, w, x:T if w128{T} and w== 64} = emit{void, '_mm_storeu_si64', ptr, v2i{x}}
+def storeLow{ptr:P, w, x:T if w128{T} and w==128} = store{*T~~ptr, 0, x}
# float comparison
-def unord{a:T,b:T & T==[4]f32} = [4]u32~~emit{[4]f32, '_mm_cmpunord_ps', a, b}
-def unord{a:T,b:T & T==[2]f64} = [2]u64~~emit{[2]f64, '_mm_cmpunord_pd', a, b}
+def unord{a:T,b:T if T==[4]f32} = [4]u32~~emit{[4]f32, '_mm_cmpunord_ps', a, b}
+def unord{a:T,b:T if T==[2]f64} = [2]u64~~emit{[2]f64, '_mm_cmpunord_pd', a, b}
# shift
-def shl{S==[16]u8, x:T, n & w128{T}} = T ~~ emit{T, '_mm_bslli_si128', x, n}
-def shr{S==[16]u8, x:T, n & w128{T}} = T ~~ emit{T, '_mm_bsrli_si128', x, n}
+def shl{S==[16]u8, x:T, n if w128{T}} = T ~~ emit{T, '_mm_bslli_si128', x, n}
+def shr{S==[16]u8, x:T, n if w128{T}} = T ~~ emit{T, '_mm_bsrli_si128', x, n}
# integer arith
-def mulh{a:T,b:T & [8]i16==T} = emit{T, '_mm_mulhi_epi16', a, b}
-def mulh{a:T,b:T & [8]u16==T} = emit{T, '_mm_mulhi_epu16', a, b}
-def mul32{a:T,b:T & [2]u64==T} = emit{T, '_mm_mul_epu32', a, b} # reads only low 32 bits of arguments
-def __mul{a:T,b:T & [4]i32==T} = {
+def mulh{a:T,b:T if [8]i16==T} = emit{T, '_mm_mulhi_epi16', a, b}
+def mulh{a:T,b:T if [8]u16==T} = emit{T, '_mm_mulhi_epu16', a, b}
+def mul32{a:T,b:T if [2]u64==T} = emit{T, '_mm_mul_epu32', a, b} # reads only low 32 bits of arguments
+def __mul{a:T,b:T if [4]i32==T} = {
def mu{x, y} = [4]i32 ~~ mul32{[2]u64~~x, [2]u64~~y}
def sw{n, ...vs} = each{{c} => shuf{[4]i32, c, n}, vs}
lo:= mu{a, b}
@@ -44,55 +44,55 @@ def rsqrtE{a:([4]f32)} = emit{[4]f32, '_mm_rsqrt_ps', a}
def rcpE{a:([4]f32)} = emit{[4]f32, '_mm_rcp_ps', a}
# mask stuff
-def andAllZero{x:T, y:T & w128i{T}} = homAll{(x & y) == T**0}
+def andAllZero{x:T, y:T if w128i{T}} = homAll{(x & y) == T**0}
-def topMask{x:T & w128{T, 8}} = emit{u16, '_mm_movemask_epi8', x}
-def topMask{x:T & w128{T, 16}} = topMask{packs{[8]i16~~x, [8]i16**0}}
-def topMask{x:T & w128{T, 32}} = emit{u8, '_mm_movemask_ps', v2f{x}}
-def topMask{x:T & w128{T, 64}} = emit{u8, '_mm_movemask_pd', v2d{x}}
-def homMask{x:T & w128{T}} = topMask{x}
-def homMaskX{a:T & elwidth{T}==16} = tup{2, homMask{re_el{u8,a}}}
-def homMask{a:T, b:T & w128i{T,16}} = homMask{packs{ty_s{a},ty_s{b}}}
+def topMask{x:T if w128{T, 8}} = emit{u16, '_mm_movemask_epi8', x}
+def topMask{x:T if w128{T, 16}} = topMask{packs{[8]i16~~x, [8]i16**0}}
+def topMask{x:T if w128{T, 32}} = emit{u8, '_mm_movemask_ps', v2f{x}}
+def topMask{x:T if w128{T, 64}} = emit{u8, '_mm_movemask_pd', v2d{x}}
+def homMask{x:T if w128{T}} = topMask{x}
+def homMaskX{a:T if elwidth{T}==16} = tup{2, homMask{re_el{u8,a}}}
+def homMask{a:T, b:T if w128i{T,16}} = homMask{packs{ty_s{a},ty_s{b}}}
-def homAny{x:T & w128i{T}} = homMask{[16]u8 ~~ x} != 0
-def homAll{x:T & w128i{T}} = homMask{[16]u8 ~~ x} == 0xffff
+def homAny{x:T if w128i{T}} = homMask{[16]u8 ~~ x} != 0
+def homAll{x:T if w128i{T}} = homMask{[16]u8 ~~ x} == 0xffff
-def topAny{x:T & w128i{T}} = topMask{x} != 0
-def topAll{x:T & w128i{T}} = topMask{x} == (1<<vcount{T})-1
-def topAny{x:T & w128i{T, 16}} = homAny{[8]i16~~x < [8]i16**0}
-def topAll{x:T & w128i{T, 16}} = homAll{[8]i16~~x < [8]i16**0}
+def topAny{x:T if w128i{T}} = topMask{x} != 0
+def topAll{x:T if w128i{T}} = topMask{x} == (1<<vcount{T})-1
+def topAny{x:T if w128i{T, 16}} = homAny{[8]i16~~x < [8]i16**0}
+def topAll{x:T if w128i{T, 16}} = homAll{[8]i16~~x < [8]i16**0}
# bits of other things SSE2 has
-def packs{a:T,b:T & T==[8]i16} = emit{[16]i8, '_mm_packs_epi16', a, b}
-def packs{a:T,b:T & T==[4]i32} = emit{[ 8]i16, '_mm_packs_epi32', a, b}
-def packs{a:T,b:T & T==[8]u16} = emit{[16]u8, '_mm_packus_epi16', a, b}
-def packQ{a:T,b:T & w128i{T}} = packs{a,b}
+def packs{a:T,b:T if T==[8]i16} = emit{[16]i8, '_mm_packs_epi16', a, b}
+def packs{a:T,b:T if T==[4]i32} = emit{[ 8]i16, '_mm_packs_epi32', a, b}
+def packs{a:T,b:T if T==[8]u16} = emit{[16]u8, '_mm_packus_epi16', a, b}
+def packQ{a:T,b:T if w128i{T}} = packs{a,b}
-def zipLo{a:T, b:T & w128i{T}} = emit{T, merge{'_mm_unpacklo_epi',fmtnat{elwidth{T}}}, a, b}
-def zipHi{a:T, b:T & w128i{T}} = emit{T, merge{'_mm_unpackhi_epi',fmtnat{elwidth{T}}}, a, b}
-def zipLo{a:T, b:T & w128f{T}} = emit{T, merge{'_mm_unpacklo_p',if (elwidth{T}==32) 's' else 'd'}, a, b}
-def zipHi{a:T, b:T & w128f{T}} = emit{T, merge{'_mm_unpackhi_p',if (elwidth{T}==32) 's' else 'd'}, a, b}
-def zip{a:T, b:T & w128i{T}} = tup{zipLo{a,b}, zipHi{a,b}}
+def zipLo{a:T, b:T if w128i{T}} = emit{T, merge{'_mm_unpacklo_epi',fmtnat{elwidth{T}}}, a, b}
+def zipHi{a:T, b:T if w128i{T}} = emit{T, merge{'_mm_unpackhi_epi',fmtnat{elwidth{T}}}, a, b}
+def zipLo{a:T, b:T if w128f{T}} = emit{T, merge{'_mm_unpacklo_p',if (elwidth{T}==32) 's' else 'd'}, a, b}
+def zipHi{a:T, b:T if w128f{T}} = emit{T, merge{'_mm_unpackhi_p',if (elwidth{T}==32) 's' else 'd'}, a, b}
+def zip{a:T, b:T if w128i{T}} = tup{zipLo{a,b}, zipHi{a,b}}
-def unpackQ{a:T, b:T & w128{T}} = mzip{a, b}
+def unpackQ{a:T, b:T if w128{T}} = mzip{a, b}
-def shuf{L, x:T, n & w128{T} & lvec{L,4,32} & knum{n}} = T ~~ emit{[4]i32, '_mm_shuffle_epi32', v2i{x}, n}
+def shuf{L, x:T, n if w128{T} and lvec{L,4,32} and knum{n}} = T ~~ emit{[4]i32, '_mm_shuffle_epi32', v2i{x}, n}
def shuf16Lo{x:T, n} = T~~emit{[8]i16, '_mm_shufflelo_epi16', x, n}
def shuf16Hi{x:T, n} = T~~emit{[8]i16, '_mm_shufflehi_epi16', x, n}
-def homBlend{f:T, t:T, m:M & w128{T} & w128i{M,elwidth{T}}} = T ~~ ((M~~t & m) | (M~~f &~ m))
-def homMaskStoreF{p:P, m:M, v:T & w128i{M} & w128{T,elwidth{M}} & eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}}
+def homBlend{f:T, t:T, m:M if w128{T} and w128i{M,elwidth{T}}} = T ~~ ((M~~t & m) | (M~~f &~ m))
+def homMaskStoreF{p:P, m:M, v:T if w128i{M} and w128{T,elwidth{M}} and eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}}
-def widen{T, x:X & w128i{T} & w128i{X} & w128s{T}==w128s{X} & elwidth{T}>elwidth{X}} = {
+def widen{T, x:X if w128i{T} and w128i{X} and w128s{T}==w128s{X} and elwidth{T}>elwidth{X}} = {
def s{v} = s{mzipLo{v, v}}
- def s{v:V & V==T} = v
+ def s{v:V if V==T} = v
s{x} >> (elwidth{T} - elwidth{X})
}
-def widen{T==[2]f64, x:X & w128s{X} & elwidth{X}<32} = widen{T, widen{[4]i32, x}}
-def widen{T==[2]f64, x:X & X==[4]i32} = emit{T, '_mm_cvtepi32_pd', x}
-def widen{T==[2]f64, x:X & X==[4]f32} = emit{T, '_mm_cvtps_pd', x}
+def widen{T==[2]f64, x:X if w128s{X} and elwidth{X}<32} = widen{T, widen{[4]i32, x}}
+def widen{T==[2]f64, x:X if X==[4]i32} = emit{T, '_mm_cvtepi32_pd', x}
+def widen{T==[2]f64, x:X if X==[4]f32} = emit{T, '_mm_cvtps_pd', x}
def narrow{T==i16, x:([4]i32)} = packs{x,x}
def narrow{T==i8, x:([8]i16)} = packs{x,x}
@@ -104,5 +104,5 @@ def narrow{T==u8, x:([2]u64)} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{f{x}}}}
def narrow{T==u16, x:([2]u64)} = shuf16Lo{[8]u16~~shuf{[4]i32, x, 4b3320}, 4b3320}
def narrow{T==u32, x:([2]u64)} = [4]u32~~shuf{[4]i32, x, 4b3320}
-def narrow{T, x:X & w128f{X,64} & T<i32} = narrow{T, narrow{i32, x}}
+def narrow{T, x:X if w128f{X,64} and T<i32} = narrow{T, narrow{i32, x}}
def narrow{T==i32, x:([2]f64)} = emit{[4]i32, '_mm_cvtpd_epi32', x}
diff --git a/src/singeli/src/transpose.singeli b/src/singeli/src/transpose.singeli
index 8ebab38e..45d8dbfd 100644
--- a/src/singeli/src/transpose.singeli
+++ b/src/singeli/src/transpose.singeli
@@ -25,16 +25,16 @@ def unpack_to{f, l, x} = {
def shuf_pass{x} = each{{v} => shuf{[4]i64, v, 4b3120}, x}
# Square kernel where width is a full vector
-def transpose_square{VT, l, x & hasarch{'AVX2'}} = unpack_to{1, l/2, x}
+def transpose_square{VT, l, x if hasarch{'AVX2'}} = unpack_to{1, l/2, x}
def load2{a:T, b:T} = pair{load{a}, load{b}}
-def store2{a:T, b:T, v:T2 & w128i{eltype{T}} & w256{T2}} = {
+def store2{a:T, b:T, v:T2 if w128i{eltype{T}} and w256{T2}} = {
each{{p:P, i} => store{p, 0, eltype{P}~~half{v,i}}, tup{a,b}, iota{2}}
}
-def load_k {VT, src, l, w & w256{VT}} = each{{i} =>load {*VT~~(src+i*w), 0 }, iota{l}}
-def store_k{VT, dst, x, l, h & w256{VT}} = each{{i,v}=>store{*VT~~(dst+i*h), 0, VT~~v}, iota{l}, x}
-def load_k {VT, src, l, w & w128{VT}} = each{{i} =>{p:=src+ i*w; load2 {*VT~~p, *VT~~(p+l*w) }}, iota{l}}
-def store_k{VT, dst, x, l, h & w128{VT}} = each{{i,v}=>{p:=dst+2*i*h; store2{*VT~~p, *VT~~(p+ h), v}}, iota{l}, x}
+def load_k {VT, src, l, w if w256{VT}} = each{{i} =>load {*VT~~(src+i*w), 0 }, iota{l}}
+def store_k{VT, dst, x, l, h if w256{VT}} = each{{i,v}=>store{*VT~~(dst+i*h), 0, VT~~v}, iota{l}, x}
+def load_k {VT, src, l, w if w128{VT}} = each{{i} =>{p:=src+ i*w; load2 {*VT~~p, *VT~~(p+l*w) }}, iota{l}}
+def store_k{VT, dst, x, l, h if w128{VT}} = each{{i,v}=>{p:=dst+2*i*h; store2{*VT~~p, *VT~~(p+ h), v}}, iota{l}, x}
# Transpose kernel of size kw,kh in size w,h array
def kernel{src:P, dst:P, kw, kh, w, h} = {
diff --git a/src/singeli/src/vecfold.singeli b/src/singeli/src/vecfold.singeli
index f0aadea2..92a57f45 100644
--- a/src/singeli/src/vecfold.singeli
+++ b/src/singeli/src/vecfold.singeli
@@ -1,6 +1,6 @@
# Fold associative/commutative operation across a register
-def vfold{F, x:T & w128{T} & hasarch{'X86_64'}} = {
+def vfold{F, x:T if w128{T} and hasarch{'X86_64'}} = {
c:= x
def EW = elwidth{T}
if (EW<=64) c = F{c, shuf{[4]u32, c, 4b1032}}
@@ -9,4 +9,4 @@ def vfold{F, x:T & w128{T} & hasarch{'X86_64'}} = {
if (EW==8) { v:=extract{[8]i16~~c, 0}; F{cast_i{eltype{T}, v}, cast_i{eltype{T}, v>>8}} }
else extract{c, 0}
}
-def vfold{F, x:T & w256{T} & hasarch{'X86_64'}} = vfold{F, F{half{x, 0}, half{x, 1}}}
+def vfold{F, x:T if w256{T} and hasarch{'X86_64'}} = vfold{F, F{half{x, 0}, half{x, 1}}}