summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: dzaima <dzaimagit@gmail.com> 2024-05-11 05:52:37 +0300
committer: dzaima <dzaimagit@gmail.com> 2024-05-11 05:57:30 +0300
commit: 4f898f38d22ebec96965f7293ed4e40bd24c152d (patch)
tree: 6be09ce236446254b0040d08d0d60c6096ff123f
parent: b2eb26c2e7a391e844bdd2b3813deb1c7553c056 (diff)
various Singeli usage improvements and fixes
-rw-r--r-- src/singeli/src/avx.singeli | 4
-rw-r--r-- src/singeli/src/avx2.singeli | 14
-rw-r--r-- src/singeli/src/base.singeli | 69
-rw-r--r-- src/singeli/src/bins.singeli | 4
-rw-r--r-- src/singeli/src/bmi2.singeli | 8
-rw-r--r-- src/singeli/src/cbqnDefs.singeli | 22
-rw-r--r-- src/singeli/src/cmp.singeli | 2
-rw-r--r-- src/singeli/src/copy.singeli | 8
-rw-r--r-- src/singeli/src/dyarith.singeli | 10
-rw-r--r-- src/singeli/src/mask.singeli | 23
-rw-r--r-- src/singeli/src/neon.singeli | 16
-rw-r--r-- src/singeli/src/replicate.singeli | 2
-rw-r--r-- src/singeli/src/scan.singeli | 2
-rw-r--r-- src/singeli/src/scan_common.singeli | 2
-rw-r--r-- src/singeli/src/search.singeli | 4
-rw-r--r-- src/singeli/src/select.singeli | 6
-rw-r--r-- src/singeli/src/squeeze.singeli | 2
17 files changed, 92 insertions, 106 deletions
diff --git a/src/singeli/src/avx.singeli b/src/singeli/src/avx.singeli
index d748be9d..bb0d9337 100644
--- a/src/singeli/src/avx.singeli
+++ b/src/singeli/src/avx.singeli
@@ -25,8 +25,8 @@ def rcpE{a:([8]f32)} = emit{[8]f32, '_mm256_rcp_ps', a}
# conversion
def half{x:T, i if w256{T} and knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i}
-def half{x:T, i==0 if w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}}
-def pair{a:T,b:T if width{T}==128} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b}
+def half{x:T, (0) if w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}}
+def pair{a:T,b:T if w128{T}} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b}
def widen{T==[4]f64, x:X if X==[4]i32} = emit{T, '_mm256_cvtepi32_pd', x}
def widen{T==[4]f64, x:X if X==[4]f32} = emit{T, '_mm256_cvtps_pd', x}
diff --git a/src/singeli/src/avx2.singeli b/src/singeli/src/avx2.singeli
index a5ed5721..8623ea36 100644
--- a/src/singeli/src/avx2.singeli
+++ b/src/singeli/src/avx2.singeli
@@ -21,12 +21,12 @@ def mul32{a:T,b:T if [ 4]u64==T} = emit{T, '_mm256_mul_epu32', a, b} # reads o
# structural operations
-def shl{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n}
-def shr{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n}
+def shl{([16]u8), x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n}
+def shr{([16]u8), x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n}
-def blend{L==[8]u16, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m}
-def blend{L==[8]u32, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m}
-def blend{L==[4]u64, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m}
+def blend{([8]u16), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m}
+def blend{([8]u32), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m}
+def blend{([4]u64), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m}
def topBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = T ~~ emit{[32]i8, '_mm256_blendv_epi8', v2i{f}, v2i{t}, v2i{m}}
def homBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = topBlend{f, t, m}
@@ -93,5 +93,5 @@ def narrow{T, x:X if w256u{X,64} and T==u16} = re_el{T, sel{[16]i8, narrow{u32,x
def narrow{T, x:X if w256u{X,64} and T== u8} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, 4*iota{32}}}}
-def cvt2{T, x:X if T==i32 and X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
-def cvt2{T, x:X if T==f64 and X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x}
+def cvt2{(i32), x:X if X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
+def cvt2{(f64), x:X if X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x}
diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli
index b8fe0710..62c92cd4 100644
--- a/src/singeli/src/base.singeli
+++ b/src/singeli/src/base.singeli
@@ -16,11 +16,11 @@ def isconst = kcon
def istype = ktyp
def istup = ktup
-def isunsigned{T} = isint{T} & ~issigned{T}
+def isunsigned{T} = isint{T} and not issigned{T}
-def isvec {T} = 0; def isvec {T if istype{T}} = same{typekind{T},'vector'}
-def isprim{T} = 0; def isprim{T if istype{T}} = same{typekind{T},'primitive'}
-def isptr {T} = 0; def isptr {T if istype{T}} = same{typekind{T},'pointer'}
+def isvec {T} = istype{T} and same{typekind{T},'vector'}
+def isprim{T} = istype{T} and same{typekind{T},'primitive'}
+def isptr {T} = istype{T} and same{typekind{T},'pointer'}
def elwidth{T} = width{eltype{T}}
oper &~ andnot infix none 35
@@ -40,16 +40,17 @@ def loadu{p:T if elwidth{T}==8} = load{p}
def storeu{p:T, v:(eltype{T}) if elwidth{T}==8} = store{p, v}
-def reinterpret{T, x:X if T==X} = x
+def reinterpret{T, x:T} = x
def exportN{f, ...ns} = each{export{.,f}, ns}
-def exportT{name, fs} = { v:*type{select{fs,0}} = fs; export{name, v} }
+def exportT{name, fs} = { v:*oneType{fs} = fs; export{name, v} }
# hints
def rare{x if knum{x}} = x
def rare{x:(u1)} = emit{u1, '__builtin_expect', x, 0}
-def assert{x if x==0} = assert{'failed assertion'}
-def assert{x if x==1} = 1
+def assert{c, ...msg} = { if (not same{c,1}) { show{...msg}; 0{} } }
+def assert{(0)} = assert{0, 'failed assertion'}
+def assert{(1)} = 1
def unreachable{} = emit{void, 'si_unreachable'}
def assert{x:(u1)} = { if (not x) emit{void, 'si_unreachable'} }
@@ -61,28 +62,21 @@ def oneVal{{h, ...t}} = {
def oneVal{{}} = {}
def oneType{x} = oneVal{each{type, x}}
-def anyNum{x} = isconst{x} | knum{x}
+def anyNum{x} = knum{x}
def anyNum{x:T} = isprim{T}
-def anyInt{x} = 0
-def anyInt{x if knum{x}} = (x>>0) == x
-def anyInt{x if isreg{x}|isconst{x}} = isint{x}
+def anyInt{x} = knum{x} and (x>>0) == x
+def anyInt{x:T} = isint{T}
# vector width/type checks
-def w64 {T} = 0; def w64 {T if isvec{T}} = width{T}==64
-def w128{T} = 0; def w128{T if isvec{T}} = width{T}==128
-def w256{T} = 0; def w256{T if isvec{T}} = width{T}==256
-def w64 {T,w} = 0; def w64 {T,w if w64{T}} = elwidth{T}==w
-def w128{T,w} = 0; def w128{T,w if w128{T}} = elwidth{T}==w
-def w256{T,w} = 0; def w256{T,w if w256{T}} = elwidth{T}==w
+def w64 {T} = isvec{T} and width{T}==64; def w64 {T,w} = w64{T} and elwidth{T}==w
+def w128{T} = isvec{T} and width{T}==128; def w128{T,w} = w128{T} and elwidth{T}==w
+def w256{T} = isvec{T} and width{T}==256; def w256{T,w} = w256{T} and elwidth{T}==w
# width+type checks
def genchk{B, F} = {
- def r{T} = 0
- def r{T if B{T}} = F{eltype{T}}
- def r{T,w} = 0
- def r{T,w if B{T}} = F{eltype{T}} & (elwidth{T}==w)
- def r{T if ~isvec{T}} = 0
+ def r{T } = B{T} and F{eltype{T}}
+ def r{T,w} = B{T} and F{eltype{T}} and elwidth{T}==w
r
}
def w256i = genchk{w256, isint}; def w128i = genchk{w128, isint}; def w64i = genchk{w64, isint}
@@ -95,10 +89,7 @@ def w256f = genchk{w256, isfloat}; def w128f = genchk{w128, isfloat}; de
def trunc{T, x:U if isint{T} and isint{U} and T<=U} = emit{T, '', x}
def trunc{T, x if knum{x}} = cast{T, x}
-def tern{c, T, F if anyInt{c}} = {
- if(c) T
- else F
-}
+def tern{c, T, F if anyInt{c}} = if(c) T else F
def tern{c, t:T, f:T if anyInt{c}} = {
res:T = f
if (c) res = t
@@ -151,18 +142,18 @@ def collect{vars,begin,end,iter if knum{begin} and knum{end}} = {
}
# convert tuple to number in little-endian base b
-def base{b,l} = if (0==length{l}) 0 else select{l,0}+b*base{b,slice{l,1}}
+def base{b,{}} = 0
+def base{b,{h,...t}} = h + b*base{b,t}
# vector definitions
def arch_defvw = if (hasarch{'AVX2'}) 256 else 128
-def has_simd = hasarch{'X86_64'} | hasarch{'AARCH64'}
-def fast_BMI2{} = if (SLOW_PDEP) 0 else hasarch{'BMI2'}
+def has_simd = hasarch{'X86_64'} or hasarch{'AARCH64'}
+def fast_BMI2{} = hasarch{'BMI2'} and not SLOW_PDEP
# test if vector has a specific width & element type
-def lvec{T, n, w} = 0
-def lvec{T, n, w if isvec{T} and vcount{T}==n and elwidth{T}==w} = 1
+def lvec{T, n, w} = isvec{T} and vcount{T}==n and elwidth{T}==w
# base cases
def {
@@ -235,10 +226,9 @@ def popc{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_popcount', x}
def ctz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_ctzll', x}
def ctz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_ctz', x}
def clz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_clzll', x}
-def clz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_clz', x}
+def clz{x:T if isint{T} and width{T}==32} = emit{ux, '__builtin_clz', x}
# count-leading-zeros complement, less type-dependent
-def clzc{x:T if isint{T} and width{T}==64} = 64-clz{x}
-def clzc{x:T if isint{T} and width{T}<=32} = 32-clz{x}
+def clzc{x:T if isint{T}} = width{T} - clz{x}
def ceil_log2{n} = clzc{n-1}
@@ -249,7 +239,7 @@ def truncBits{n, v if n==64} = cast_i{u64, v}
# base-2 log of a constant power of two
def lb{n if knum{n} and (n>>1<<1) == n and n>0} = lb{n>>1}+1
-def lb{n==1} = 0
+def lb{(1)} = 0
def zlow{n,x} = (x >> n) << n # zero out n least significant bits
def tail{n,x} = x & ((1<<n) - 1) # get the n least significant bits
@@ -292,8 +282,9 @@ def forUnroll{exp,unr}{vars,begin,end,iter} = {
iter{each{{j}=>i+j, iota{unr}}, vars}
i+= unr
}
- if (unr==2) { if (i!=end) iter{tup{i}, vars} }
- else if (unr>1) {
+ if (unr==2) {
+ if (i!=end) iter{tup{i}, vars}
+ } else if (unr>1) {
if (exp) {
def stop = makelabel{}
each{{j} => {
@@ -313,7 +304,7 @@ def makeBranch{Ts, F} = {
def start = setlabel{}
F{...args}
setlabel{skip}
- {...vs} => { each{=, args, vs}; goto{start} }
+ {...vs} => { args = vs; goto{start} }
}
def makeOptBranch{enable, Ts, F} = if (enable) makeBranch{Ts, F} else 'not defined'
diff --git a/src/singeli/src/bins.singeli b/src/singeli/src/bins.singeli
index 128ab8bb..8273f5d9 100644
--- a/src/singeli/src/bins.singeli
+++ b/src/singeli/src/bins.singeli
@@ -295,7 +295,7 @@ def bin_search_vec{prim, T, w:*T, wn, x:*T, xn, rp, maxwn if hasarch{'AVX2'}} =
}
}
-if_inline (hasarch{'AVX2'}) {
+(if (hasarch{'AVX2'}) {
fn avx2_search_bin{prim, T, maxwn}(rp:*(if (prim=='∊') u64 else i8), w:*void, wn:u64, x:*void, xn:u64) : void = {
bin_search_vec{prim, T, *T~~w, wn, *T~~x, xn, rp, maxwn}
}
@@ -307,7 +307,7 @@ if_inline (hasarch{'AVX2'}) {
'avx2_indexOf_sort',
each{avx2_search_bin{'⊐',.,.}, tup{i8,i16,i32}, tup{64,16,16}}
}
-}
+})
def unroll_sizes = tup{4,1}
fn write{T,k}(r:*void, i:u64, ...vs:k**u64) : void = {
diff --git a/src/singeli/src/bmi2.singeli b/src/singeli/src/bmi2.singeli
index 640ca3bd..e26237ed 100644
--- a/src/singeli/src/bmi2.singeli
+++ b/src/singeli/src/bmi2.singeli
@@ -1,4 +1,4 @@
-def pdep{x:(u64), m:(u64)} = emit{u64, '_pdep_u64', x, m}
-def pdep{x:(u32), m:(u32)} = emit{u32, '_pdep_u32', x, m}
-def pext{x:(u64), m:(u64)} = emit{u64, '_pext_u64', x, m}
-def pext{x:(u32), m:(u32)} = emit{u32, '_pext_u32', x, m}
+def pdep{x:T, m:T if T==u64} = emit{T, '_pdep_u64', x, m}
+def pdep{x:T, m:T if T==u32} = emit{T, '_pdep_u32', x, m}
+def pext{x:T, m:T if T==u64} = emit{T, '_pext_u64', x, m}
+def pext{x:T, m:T if T==u32} = emit{T, '_pext_u32', x, m}
diff --git a/src/singeli/src/cbqnDefs.singeli b/src/singeli/src/cbqnDefs.singeli
index cac96e91..962f1da4 100644
--- a/src/singeli/src/cbqnDefs.singeli
+++ b/src/singeli/src/cbqnDefs.singeli
@@ -5,9 +5,9 @@ def from_B{T, x if T<=u32 and isunsigned{T}} = bcall{T, 'o2cG', x}
def q_f64{x} = bcall{u1, 'q_f64', x}
def q_chr{x} = bcall{u1, 'q_c32', x}
-def q_chr{T,x if T==u8 } = bcall{u1, 'q_c8', x}
-def q_chr{T,x if T==u16} = bcall{u1, 'q_c16', x}
-def q_chr{T,x if T==u32} = bcall{u1, 'q_c32', x}
+def q_chr{(u8 ),x} = bcall{u1, 'q_c8', x}
+def q_chr{(u16),x} = bcall{u1, 'q_c16', x}
+def q_chr{(u32),x} = bcall{u1, 'q_c32', x}
def cbqn_c32Tag{} = emit{u64, '', 'C32_TAG'}
def cbqn_tagTag{} = emit{u64, '', 'TAG_TAG'}
@@ -21,14 +21,14 @@ def cbqn_nspTag{} = emit{u64, '', 'NSP_TAG'}
def cbqn_objTag{} = emit{u64, '', 'OBJ_TAG'}
def cbqn_arrTag{} = emit{u64, '', 'ARR_TAG'}
-def cbqn_elType{T if T==u1 } = 0
-def cbqn_elType{T if T==i8 } = 1
-def cbqn_elType{T if T==i16} = 2
-def cbqn_elType{T if T==i32} = 3
-def cbqn_elType{T if T==f64} = 4
-def cbqn_elType{T if T==u8 } = 5
-def cbqn_elType{T if T==u16} = 6
-def cbqn_elType{T if T==u32} = 7
+def cbqn_elType{(u1 )} = 0
+def cbqn_elType{(i8 )} = 1
+def cbqn_elType{(i16)} = 2
+def cbqn_elType{(i32)} = 3
+def cbqn_elType{(f64)} = 4
+def cbqn_elType{(u8 )} = 5
+def cbqn_elType{(u16)} = 6
+def cbqn_elType{(u32)} = 7
def cbqn_tyArrOffset{} = emit{u64, 'offsetof', 'TyArr', 'a'}
diff --git a/src/singeli/src/cmp.singeli b/src/singeli/src/cmp.singeli
index 04d673d6..0d58ae1c 100644
--- a/src/singeli/src/cmp.singeli
+++ b/src/singeli/src/cmp.singeli
@@ -14,7 +14,7 @@ fn cmpIX(dst:(*u64), len:ux, x:(u64), v:(u1)) : void = {
fillbits{dst, len, v&~nan, x}
}
-def eqne{op} = same{op,__eq}|same{op,__ne}
+def eqne{op} = same{op,__eq} or same{op,__ne}
def pathAS{dst, len, T, op, x if issigned{T}} = {
def R{f if eqne{op}} = {
diff --git a/src/singeli/src/copy.singeli b/src/singeli/src/copy.singeli
index 450cc1aa..a4bb3e9a 100644
--- a/src/singeli/src/copy.singeli
+++ b/src/singeli/src/copy.singeli
@@ -30,7 +30,7 @@ fn copy{X, R}(x: *void, r: *void, l:u64, xRaw: *void) : void = {
else emit{void, 'memcpy', r, x, l*(width{X}/8)}
} else if (R==u64) {
# show{'R==u64', X, R}
- assert{((X==u8) | (X==u16)) | (X==u32)}
+ assert{X==u8 or X==u16 or X==u32}
# TODO could maybe read 256 bits and use unpack to write >256
@maskedLoop{bulk}(sr in tup{'g',rp}, x in tup{RV,xp} over l) sr{x | RV**(cbqn_c32Tag{}<<48)}
} else if (X==u1 and R==u1) {
@@ -85,9 +85,9 @@ def gen{p} = {
def tm = tup{0, 0, 0, 0, 0, 1, 1, 1, 2}
each{{tx0,nx,mx} => {
each{{tr0,nr,mr} => {
- if ((mx==mr or mx==2 or mr==2) and (if (mx==2) mr==1; else 1)) {
- def tr = if (mx==0 and mr==2) f64; else if (tx0==tr0 and mx==1) ty_s{tx0}; else tr0
- def tx = if (mr==0 and mx==2) f64; else if (tx0==tr0 and mx==1) ty_s{tx0}; else tx0
+ if ((mx==mr or mx==2 or mr==2) and (if (mx==2) mr==1 else 1)) {
+ def tr = if (mx==0 and mr==2) f64 else if (tx0==tr0 and mx==1) ty_s{tx0} else tr0
+ def tx = if (mr==0 and mx==2) f64 else if (tx0==tr0 and mx==1) ty_s{tx0} else tx0
export{merge{p, nx, '_', nr}, copy{tx, tr}}
}
}, ts, tn, tm}
diff --git a/src/singeli/src/dyarith.singeli b/src/singeli/src/dyarith.singeli
index 768b81c4..7926f00f 100644
--- a/src/singeli/src/dyarith.singeli
+++ b/src/singeli/src/dyarith.singeli
@@ -108,11 +108,11 @@ def runner{u, R, F} = {
# homAny, topAny already give masked vals; anyne doesn't, and ~andAllZero assumes no masking
def runChecks_any{F, vals} = { F{tree_fold{|, each{select{.,1}, vals}}} }
-def runChecks{type=='homAny', vals, M} = runChecks_any{homAny, vals}
-def runChecks{type=='topAny', vals, M} = runChecks_any{topAny, vals}
-def runChecks{type=='none', vals, M} = 0
-def runChecks{type=='~andAllZero', vals, M if ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}}
-def runChecks{type=='anyne', vals, M} = {
+def runChecks{('homAny'), vals, M} = runChecks_any{homAny, vals}
+def runChecks{('topAny'), vals, M} = runChecks_any{topAny, vals}
+def runChecks{('none'), vals, M} = 0
+def runChecks{('~andAllZero'), vals, M if ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}}
+def runChecks{('anyne'), vals, M} = {
def i{vals} = {
def {_,xs,ys} = flip{vals}
assert{M{0} == 0}
diff --git a/src/singeli/src/mask.singeli b/src/singeli/src/mask.singeli
index 11433d84..d9979b7d 100644
--- a/src/singeli/src/mask.singeli
+++ b/src/singeli/src/mask.singeli
@@ -10,9 +10,9 @@ mask256:*i64 = merge{4 ** -1, 4 ** 0}
local def maskOfImpl{T, n, w} = load{*ty_u{T} ~~ (*u8~~mask256 + 32 - n*(elwidth{T}/8))}
# get homogeneous mask of first n items; 0 ≤ n ≤ vcount{T}
-def maskOf{T,n if width{T}==256} = maskOfImpl{T, n, 256}
-def maskOf{T,n if width{T}==128} = maskOfImpl{T, n, 128}
-def maskOf{T,n if width{T}== 64} = maskOfImpl{T, n, 64}
+def maskOf{T,n if w256{T}} = maskOfImpl{T, n, 256}
+def maskOf{T,n if w128{T}} = maskOfImpl{T, n, 128}
+def maskOf{T,n if w64{T}} = maskOfImpl{T, n, 64}
def anyne{x:T, y:T, M if M{0}==0 and isvec{T}} = ~homAll{x==y}
def anyne{x:T, y:T, M if M{0}==1 and isvec{T}} = homAny{M{x!=y}}
@@ -28,16 +28,16 @@ def anynePositive{x:T, y:T, M if M{0}==1 and isvec{T}} = {
}
def maskNone{x} = x
-def maskNone{x, mode=='all bits zeroes'} = andAllZero{x, x}
+def maskNone{x, ('all bits zeroes')} = andAllZero{x, x}
def maskAfter{n} = {
+ def mask{x:X, ('all bits zeroes')} = andAllZero{x, X~~maskOfBit{X,n}}
+ def mask{X, ('to sign bits')} = maskOf{X,n}
+ def mask{X, ('to homogeneous bits')} = maskOf{X,n}
+ def mask{('count')} = n
+ def mask{{x}} = tup{mask{x}}
def mask{x:X if isvec{X}} = x & (X~~maskOf{X,n})
def mask{x:X if anyInt{x}} = x & ((1<<n) - 1)
- def mask{x:X, mode=='all bits zeroes'} = andAllZero{x, X~~maskOfBit{X,n}}
- def mask{X, mode=='to sign bits'} = maskOf{X,n}
- def mask{X, mode=='to homogeneous bits'} = maskOf{X,n}
- def mask{mode=='count'} = n
- def mask{{x}} = tup{mask{x}}
- def mask{x==0} = 1
+ def mask{(0)} = 1
}
@@ -69,7 +69,7 @@ def storeBatch{ptr:P, ns, xs, M if istup{ns}} = each{{n,x} => storeBatch{ptr, n,
# "harmless" pointer cast that'll only cast void*
-def hCast{T,p} = assert{show{'expected pointer with element',T,'or void but got ',p}}
+def hCast{T,p} = assert{0, 'expected pointer with element',T,'or void but got ',p}
def hCast{T,p:*T} = p
def hCast{T,p:(*void)} = *T~~p
@@ -123,7 +123,6 @@ def maskedLoopPositive{bulk}{vars,begin==0,end:L,iter} = {
# index given is a tuple of batch indexes to process
def muLoop{bulk, unr, fromunr}{vars,begin==0,end,iter} = {
l:u64 = end
- def step = 123123123
m:u64 = l / bulk
if (unr==1) {
diff --git a/src/singeli/src/neon.singeli b/src/singeli/src/neon.singeli
index 70933be5..2f964841 100644
--- a/src/singeli/src/neon.singeli
+++ b/src/singeli/src/neon.singeli
@@ -1,15 +1,11 @@
-def nvec{T} = 0
-def nvec{T if isvec{T}} = (width{T}==64) | (width{T}==128)
-def nvec{T,w} = 0
-def nvec{T,w if nvec{T}} = elwidth{T}==w
+def nvec{T} = isvec{T} and (width{T}==64 or width{T}==128)
+def nvec{T,w} = nvec{T} and elwidth{T}==w
def nveci = genchk{nvec, isint}
def nvecs = genchk{nvec, issigned}
def nvecu = genchk{nvec, isunsigned}
def nvecf = genchk{nvec, isfloat}
-def reinterpret{T, v if same{'pointer',typekind{T}} and ktup{v}} = { tmp:T=v }
-
def nty{T} = {
def q = quality{T}
merge{if (q=='i') 's' else q, fmtnat{width{T}}}
@@ -32,7 +28,7 @@ def shlm{a:T, s, d:T if nvecu{T}} = emit{T, ntyp{'vsli', '_n', T}, d, a, s} # (
def bitBlend{f:T, t:T, m:M if nvec{T} and nvecu{M,elwidth{T}} and width{T}==width{M}} = emit{T, ntyp{'vbsl', T}, m, t, f}
def homBlend{f:T, t:T, m:M if nvec{M}} = bitBlend{f, t, m}
-def addpw { x:T if nveci{T} and elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening
+def addpw { x:T if nveci{T} and elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening
def addpwa{a:D, x:T if nveci{T} and elwidth{T}<=32 and D==el_m{T}} = emit{D, ntyp{'vpadal', T}, a, x} # add pairwise widening + accumulate
def mla{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmla', T}, a, x, y} # a + x*y
def mls{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmls', T}, a, x, y} # a - x*y
@@ -67,8 +63,8 @@ def loadLow{ptr:P, w if w==elwidth{P}} = load{ptr}
def undefPromote{T, x:X if w64{X} and w128{T} and eltype{T}==eltype{X}} = emit{T, ntyp{'vcombine', X}, x, x} # arm_neon.h doesn't actually provide a way to do this in a 0-instruction way. ¯\_(ツ)_/¯
-def half{x:T, n==0 if w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x}
-def half{x:T, n==1 if w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x}
+def half{x:T, (0) if w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x}
+def half{x:T, (1) if w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x}
def pair{a:T, b:T if w64{T}} = emit{n_d{T}, ntyp0{'vcombine', T}, a, b}
def copyLane{dst:D, di, src:S, si if w64{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopy_lane', S}, dst, di, src, si}
def copyLane{dst:D, di, src:S, si if w128{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopyq_lane', S}, dst, di, src, si}
@@ -125,7 +121,7 @@ def homAll{x:T if nvec{T}} = bitAll{x}
def homMask{x:T if nvecu{T} and elwidth{T}>=vcount{T}} = {
truncBits{vcount{T}, fold_add{x & make{T, 1<<iota{vcount{T}}}}}
}
-def homMask{x:T if nvecu{T} and T==[16]u8} = {
+def homMask{x:T if T==[16]u8} = {
t:= [8]u16~~sel{[16]u8, x, make{[16]u8, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15}}
fold_add{t & make{[8]u16, (1<<iota{8})*0x0101}}
}
diff --git a/src/singeli/src/replicate.singeli b/src/singeli/src/replicate.singeli
index bc1c6dea..444b2ab6 100644
--- a/src/singeli/src/replicate.singeli
+++ b/src/singeli/src/replicate.singeli
@@ -170,7 +170,7 @@ if_inline (hasarch{'AVX2'}) {
def rep_const_shuffle{V, wv, xv:*V, rv:*V, n:(u64)} = rep_const_shuffle{V, wv, get_rep_iter{V, wv}, xv, rv, n}
-} else if (hasarch{'AARCH64'}) {
+} else if_inline (hasarch{'AARCH64'}) {
def rep_iter_from_sh{sh}{x, gen} = {
each{{s} => gen{sel{[16]u8, x, s}}, sh}
diff --git a/src/singeli/src/scan.singeli b/src/singeli/src/scan.singeli
index b6719273..c3f6f628 100644
--- a/src/singeli/src/scan.singeli
+++ b/src/singeli/src/scan.singeli
@@ -146,7 +146,7 @@ fn bcs{T if hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = {
c:= V**0
def ii32 = iota{32}; def bit{k}=bit{k,ii32}; def tail{k}=tail{k,ii32}
- def sums{n} = (if (n==0) tup{0}; else { def s=sums{n-1}; merge{s,s+1} })
+ def sums{n} = (if (n==0) tup{0} else { def s=sums{n-1}; merge{s,s+1} })
def widen{v:T} = unpackQ{shuf{[4]u64, v, 4b3120}, T**0}
def sumlanes{x:(u32)} = {
diff --git a/src/singeli/src/scan_common.singeli b/src/singeli/src/scan_common.singeli
index 0d861f2e..b6c444a5 100644
--- a/src/singeli/src/scan_common.singeli
+++ b/src/singeli/src/scan_common.singeli
@@ -15,7 +15,7 @@ def zip{up, x} = (if (up) zipHi else zipLo){x,x}
def spread{a:VT, ...up} = {
def w = elwidth{VT}
def b = w/8
- if (w<=16) sel8{a,merge{iota{12},(16-b)+iota{4}%b}, ...up}; else a
+ if (w<=16) sel8{a,merge{iota{12},(16-b)+iota{4}%b}, ...up} else a
}
# Set all elements with the last element of the input
diff --git a/src/singeli/src/search.singeli b/src/singeli/src/search.singeli
index b3f9f3de..6770f5da 100644
--- a/src/singeli/src/search.singeli
+++ b/src/singeli/src/search.singeli
@@ -60,14 +60,14 @@ fn copyOrdered{}(r:*f64, x:*f64, len:u64) : u1 = {
0
}
-if_inline (hasarch{'X86_64'} | hasarch{'AARCH64'}) {
+(if (has_simd) {
export{'simd_search_u8', searchOne{u64, u8}}
export{'simd_search_u16', searchOne{u64, u16}}
export{'simd_search_u32', searchOne{u64, u32}}
export{'simd_search_f64', searchOne{f64, f64}}
export{'simd_search_normalizable', searchNormalizable{}}
export{'simd_copy_ordered', copyOrdered{}}
-}
+})
# In-register bit table
diff --git a/src/singeli/src/select.singeli b/src/singeli/src/select.singeli
index 983e4e2b..9d5799e3 100644
--- a/src/singeli/src/select.singeli
+++ b/src/singeli/src/select.singeli
@@ -37,7 +37,7 @@ def shuf_select{ri, rd, TI, w, r, wl, xl, selx} = {
xlf:= VI**cast_i{TI, xl}
@maskedLoop{ri}(cw0 in tup{VI,w}, M in 'm' over i to wl) {
cw:= wrapChk{cw0, VI,xlf, M}
- is:= (if (ext>1) i<<lb{ext}; else i)
+ is:= (if (ext>1) i<<lb{ext} else i)
def se{e, c, o} = {
c2:= shuf{[4]u64, c+c, 4b3120}
each{
@@ -57,7 +57,7 @@ def perm_select{ri, rd, TI, w, r, wl, xl, selx} = {
xlf:= VI**cast_i{TI, xl}
@maskedLoop{ri}(cw0 in tup{VI,w}, M in 'm' over i to wl) {
cw:= wrapChk{cw0, VI,xlf, M}
- is:= (if (ext>1) i<<lb{ext}; else i)
+ is:= (if (ext>1) i<<lb{ext} else i)
def part{o} = widen{[8]i32, re_el{i8, shuf{[4]u64, cw, 4b3210+o}}}
def se{o} = storeExp{r, is+o, selx{part{o}}, M, ext, rd, wl}
each{se, iota{ext}}
@@ -67,7 +67,7 @@ def perm_select{ri, rd, TI, w, r, wl, xl, selx} = {
def makeselx{VI, VD, nsel, xd, logv, cshuf} = {
def bblend {m}{{f,t}} = homBlend{f, t, type{f} ~~ m}
def bblendn{m}{{t,f}} = bblend{m}{tup{f,t}}
- def bb{c}{f, v} = (if (f) bblendn{c<v}; else bblend{(c&v)==v})
+ def bb{c}{f, v} = (if (f) bblendn{c<v} else bblend{(c&v)==v})
def bs{b, c, x} = cshuf{x, c}
def bs{b, c, x if length{b}>0} = {
diff --git a/src/singeli/src/squeeze.singeli b/src/singeli/src/squeeze.singeli
index 2d16a10f..8e63d014 100644
--- a/src/singeli/src/squeeze.singeli
+++ b/src/singeli/src/squeeze.singeli
@@ -124,7 +124,7 @@ fn squeeze{vw, X, CHR, B}(x0:*void, len:ux) : u32 = {
}
int
}
- def acc = { if (length{is}==2) r2; else r1 }
+ def acc = { if (length{is}==2) r2 else r1 }
acc|= M{getAcc{type{acc}, int}}
}