-rw-r--r-- | src/singeli/src/avx.singeli         |  4
-rw-r--r-- | src/singeli/src/avx2.singeli        | 14
-rw-r--r-- | src/singeli/src/base.singeli        | 69
-rw-r--r-- | src/singeli/src/bins.singeli        |  4
-rw-r--r-- | src/singeli/src/bmi2.singeli        |  8
-rw-r--r-- | src/singeli/src/cbqnDefs.singeli    | 22
-rw-r--r-- | src/singeli/src/cmp.singeli         |  2
-rw-r--r-- | src/singeli/src/copy.singeli        |  8
-rw-r--r-- | src/singeli/src/dyarith.singeli     | 10
-rw-r--r-- | src/singeli/src/mask.singeli        | 23
-rw-r--r-- | src/singeli/src/neon.singeli        | 16
-rw-r--r-- | src/singeli/src/replicate.singeli   |  2
-rw-r--r-- | src/singeli/src/scan.singeli        |  2
-rw-r--r-- | src/singeli/src/scan_common.singeli |  2
-rw-r--r-- | src/singeli/src/search.singeli      |  4
-rw-r--r-- | src/singeli/src/select.singeli      |  6
-rw-r--r-- | src/singeli/src/squeeze.singeli     |  2
17 files changed, 92 insertions, 106 deletions
diff --git a/src/singeli/src/avx.singeli b/src/singeli/src/avx.singeli
index d748be9d..bb0d9337 100644
--- a/src/singeli/src/avx.singeli
+++ b/src/singeli/src/avx.singeli
@@ -25,8 +25,8 @@ def rcpE{a:([8]f32)} = emit{[8]f32, '_mm256_rcp_ps', a}
 
 # conversion
 def half{x:T, i if w256{T} and knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i}
-def half{x:T, i==0 if w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}}
-def pair{a:T,b:T if width{T}==128} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b}
+def half{x:T, (0) if w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}}
+def pair{a:T,b:T if w128{T}} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b}
 
 def widen{T==[4]f64, x:X if X==[4]i32} = emit{T, '_mm256_cvtepi32_pd', x}
 def widen{T==[4]f64, x:X if X==[4]f32} = emit{T, '_mm256_cvtps_pd', x}
diff --git a/src/singeli/src/avx2.singeli b/src/singeli/src/avx2.singeli
index a5ed5721..8623ea36 100644
--- a/src/singeli/src/avx2.singeli
+++ b/src/singeli/src/avx2.singeli
@@ -21,12 +21,12 @@ def mul32{a:T,b:T if [ 4]u64==T} = emit{T, '_mm256_mul_epu32', a, b} # reads o
 
 # structural operations
-def shl{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n}
-def shr{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n}
+def shl{([16]u8), x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n}
+def shr{([16]u8), x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n}
 
-def blend{L==[8]u16, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m}
-def blend{L==[8]u32, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m}
-def blend{L==[4]u64, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m}
+def blend{([8]u16), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m}
+def blend{([8]u32), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m}
+def blend{([4]u64), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m}
 def topBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = T ~~ emit{[32]i8, '_mm256_blendv_epi8', v2i{f}, v2i{t}, v2i{m}}
 def homBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = topBlend{f, t, m}
@@ -93,5 +93,5 @@ def narrow{T, x:X if w256u{X,64} and T==u16} = re_el{T, sel{[16]i8, narrow{u32,x
 def narrow{T, x:X if w256u{X,64} and T== u8} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, 4*iota{32}}}}
 
-def cvt2{T, x:X if T==i32 and X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
-def cvt2{T, x:X if T==f64 and X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x}
+def cvt2{(i32), x:X if X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
+def cvt2{(f64), x:X if X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x}
diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli
index b8fe0710..62c92cd4 100644
--- a/src/singeli/src/base.singeli
+++ b/src/singeli/src/base.singeli
@@ -16,11 +16,11 @@ def isconst = kcon
 def istype = ktyp
 def istup = ktup
-def isunsigned{T} = isint{T} & ~issigned{T}
+def isunsigned{T} = isint{T} and not issigned{T}
 
-def isvec {T} = 0; def isvec {T if istype{T}} = same{typekind{T},'vector'}
-def isprim{T} = 0; def isprim{T if istype{T}} = same{typekind{T},'primitive'}
-def isptr {T} = 0; def isptr {T if istype{T}} = same{typekind{T},'pointer'}
+def isvec {T} = istype{T} and same{typekind{T},'vector'}
+def isprim{T} = istype{T} and same{typekind{T},'primitive'}
+def isptr {T} = istype{T} and same{typekind{T},'pointer'}
 def elwidth{T} = width{eltype{T}}
 
 oper &~ andnot infix none 35
@@ -40,16 +40,17 @@ def loadu{p:T if elwidth{T}==8} = load{p}
 def storeu{p:T, v:(eltype{T}) if elwidth{T}==8} = store{p, v}
 
-def reinterpret{T, x:X if T==X} = x
+def reinterpret{T, x:T} = x
 
 def exportN{f, ...ns} = each{export{.,f}, ns}
-def exportT{name, fs} = { v:*type{select{fs,0}} = fs; export{name, v} }
+def exportT{name, fs} = { v:*oneType{fs} = fs; export{name, v} }
 
 # hints
 def rare{x if knum{x}} = x
 def rare{x:(u1)} = emit{u1, '__builtin_expect', x, 0}
-def assert{x if x==0} = assert{'failed assertion'}
-def assert{x if x==1} = 1
+def assert{c, ...msg} = { if (not same{c,1}) { show{...msg}; 0{} } }
+def assert{(0)} = assert{0, 'failed assertion'}
+def assert{(1)} = 1
 def unreachable{} = emit{void, 'si_unreachable'}
 def assert{x:(u1)} = { if (not x) emit{void, 'si_unreachable'} }
@@ -61,28 +62,21 @@ def oneVal{{h, ...t}} = {
 def oneVal{{}} = {}
 def oneType{x} = oneVal{each{type, x}}
-def anyNum{x} = isconst{x} | knum{x}
+def anyNum{x} = knum{x}
 def anyNum{x:T} = isprim{T}
-def anyInt{x} = 0
-def anyInt{x if knum{x}} = (x>>0) == x
-def anyInt{x if isreg{x}|isconst{x}} = isint{x}
+def anyInt{x} = knum{x} and (x>>0) == x
+def anyInt{x:T} = isint{T}
 
 # vector width/type checks
-def w64 {T} = 0; def w64 {T if isvec{T}} = width{T}==64
-def w128{T} = 0; def w128{T if isvec{T}} = width{T}==128
-def w256{T} = 0; def w256{T if isvec{T}} = width{T}==256
-def w64 {T,w} = 0; def w64 {T,w if w64{T}} = elwidth{T}==w
-def w128{T,w} = 0; def w128{T,w if w128{T}} = elwidth{T}==w
-def w256{T,w} = 0; def w256{T,w if w256{T}} = elwidth{T}==w
+def w64 {T} = isvec{T} and width{T}==64;  def w64 {T,w} = w64{T} and elwidth{T}==w
+def w128{T} = isvec{T} and width{T}==128; def w128{T,w} = w128{T} and elwidth{T}==w
+def w256{T} = isvec{T} and width{T}==256; def w256{T,w} = w256{T} and elwidth{T}==w
 
 # width+type checks
 def genchk{B, F} = {
-  def r{T} = 0
-  def r{T if B{T}} = F{eltype{T}}
-  def r{T,w} = 0
-  def r{T,w if B{T}} = F{eltype{T}} & (elwidth{T}==w)
-  def r{T if ~isvec{T}} = 0
+  def r{T } = B{T} and F{eltype{T}}
+  def r{T,w} = B{T} and F{eltype{T}} and elwidth{T}==w
   r
 }
 def w256i = genchk{w256, isint}; def w128i = genchk{w128, isint}; def w64i = genchk{w64, isint}
@@ -95,10 +89,7 @@ def w256f = genchk{w256, isfloat}; def w128f = genchk{w128, isfloat}; de
 def trunc{T, x:U if isint{T} and isint{U} and T<=U} = emit{T, '', x}
 def trunc{T, x if knum{x}} = cast{T, x}
 
-def tern{c, T, F if anyInt{c}} = {
-  if(c) T
-  else F
-}
+def tern{c, T, F if anyInt{c}} = if(c) T else F
 def tern{c, t:T, f:T if anyInt{c}} = {
   res:T = f
   if (c) res = t
@@ -151,18 +142,18 @@ def collect{vars,begin,end,iter if knum{begin} and knum{end}} = {
 }
 
 # convert tuple to number in little-endian base b
-def base{b,l} = if (0==length{l}) 0 else select{l,0}+b*base{b,slice{l,1}}
+def base{b,{}} = 0
+def base{b,{h,...t}} = h + b*base{b,t}
 
 # vector definitions
 def arch_defvw = if (hasarch{'AVX2'}) 256 else 128
-def has_simd = hasarch{'X86_64'} | hasarch{'AARCH64'}
-def fast_BMI2{} = if (SLOW_PDEP) 0 else hasarch{'BMI2'}
+def has_simd = hasarch{'X86_64'} or hasarch{'AARCH64'}
+def fast_BMI2{} = hasarch{'BMI2'} and not SLOW_PDEP
 
 # test if vector has a specific width & element type
-def lvec{T, n, w} = 0
-def lvec{T, n, w if isvec{T} and vcount{T}==n and elwidth{T}==w} = 1
+def lvec{T, n, w} = isvec{T} and vcount{T}==n and elwidth{T}==w
 
 # base cases
 def {
@@ -235,10 +226,9 @@ def popc{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_popcount', x}
 def ctz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_ctzll', x}
 def ctz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_ctz', x}
 def clz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_clzll', x}
-def clz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_clz', x}
+def clz{x:T if isint{T} and width{T}==32} = emit{ux, '__builtin_clz', x}
 # count-leading-zeros complement, less type-dependent
-def clzc{x:T if isint{T} and width{T}==64} = 64-clz{x}
-def clzc{x:T if isint{T} and width{T}<=32} = 32-clz{x}
+def clzc{x:T if isint{T}} = width{T} - clz{x}
 
 def ceil_log2{n} = clzc{n-1}
@@ -249,7 +239,7 @@ def truncBits{n, v if n==64} = cast_i{u64, v}
 
 # base-2 log of a constant power of two
 def lb{n if knum{n} and (n>>1<<1) == n and n>0} = lb{n>>1}+1
-def lb{n==1} = 0
+def lb{(1)} = 0
 
 def zlow{n,x} = (x >> n) << n # zero out n least significant bits
 def tail{n,x} = x & ((1<<n) - 1) # get the n least significant bits
@@ -292,8 +282,9 @@ def forUnroll{exp,unr}{vars,begin,end,iter} = {
     iter{each{{j}=>i+j, iota{unr}}, vars}
     i+= unr
   }
-  if (unr==2) { if (i!=end) iter{tup{i}, vars} }
-  else if (unr>1) {
+  if (unr==2) {
+    if (i!=end) iter{tup{i}, vars}
+  } else if (unr>1) {
     if (exp) {
       def stop = makelabel{}
       each{{j} => {
@@ -313,7 +304,7 @@ def makeBranch{Ts, F} = {
   def start = setlabel{}
   F{...args}
   setlabel{skip}
-  {...vs} => { each{=, args, vs}; goto{start} }
+  {...vs} => { args = vs; goto{start} }
 }
 
 def makeOptBranch{enable, Ts, F} = if (enable) makeBranch{Ts, F} else 'not defined'
diff --git a/src/singeli/src/bins.singeli b/src/singeli/src/bins.singeli
index 128ab8bb..8273f5d9 100644
--- a/src/singeli/src/bins.singeli
+++ b/src/singeli/src/bins.singeli
@@ -295,7 +295,7 @@ def bin_search_vec{prim, T, w:*T, wn, x:*T, xn, rp, maxwn if hasarch{'AVX2'}} =
   }
 }
 
-if_inline (hasarch{'AVX2'}) {
+(if (hasarch{'AVX2'}) {
 fn avx2_search_bin{prim, T, maxwn}(rp:*(if (prim=='∊') u64 else i8), w:*void, wn:u64, x:*void, xn:u64) : void = {
   bin_search_vec{prim, T, *T~~w, wn, *T~~x, xn, rp, maxwn}
 }
@@ -307,7 +307,7 @@ if_inline (hasarch{'AVX2'}) {
     'avx2_indexOf_sort', each{avx2_search_bin{'⊐',.,.}, tup{i8,i16,i32}, tup{64,16,16}}
   }
-}
+})
 
 def unroll_sizes = tup{4,1}
 fn write{T,k}(r:*void, i:u64, ...vs:k**u64) : void = {
diff --git a/src/singeli/src/bmi2.singeli b/src/singeli/src/bmi2.singeli
index 640ca3bd..e26237ed 100644
--- a/src/singeli/src/bmi2.singeli
+++ b/src/singeli/src/bmi2.singeli
@@ -1,4 +1,4 @@
-def pdep{x:(u64), m:(u64)} = emit{u64, '_pdep_u64', x, m}
-def pdep{x:(u32), m:(u32)} = emit{u32, '_pdep_u32', x, m}
-def pext{x:(u64), m:(u64)} = emit{u64, '_pext_u64', x, m}
-def pext{x:(u32), m:(u32)} = emit{u32, '_pext_u32', x, m}
+def pdep{x:T, m:T if T==u64} = emit{T, '_pdep_u64', x, m}
+def pdep{x:T, m:T if T==u32} = emit{T, '_pdep_u32', x, m}
+def pext{x:T, m:T if T==u64} = emit{T, '_pext_u64', x, m}
+def pext{x:T, m:T if T==u32} = emit{T, '_pext_u32', x, m}
diff --git a/src/singeli/src/cbqnDefs.singeli b/src/singeli/src/cbqnDefs.singeli
index cac96e91..962f1da4 100644
--- a/src/singeli/src/cbqnDefs.singeli
+++ b/src/singeli/src/cbqnDefs.singeli
@@ -5,9 +5,9 @@ def from_B{T, x if T<=u32 and isunsigned{T}} = bcall{T, 'o2cG', x}
 
 def q_f64{x} = bcall{u1, 'q_f64', x}
 def q_chr{x} = bcall{u1, 'q_c32', x}
-def q_chr{T,x if T==u8 } = bcall{u1, 'q_c8', x}
-def q_chr{T,x if T==u16} = bcall{u1, 'q_c16', x}
-def q_chr{T,x if T==u32} = bcall{u1, 'q_c32', x}
+def q_chr{(u8 ),x} = bcall{u1, 'q_c8', x}
+def q_chr{(u16),x} = bcall{u1, 'q_c16', x}
+def q_chr{(u32),x} = bcall{u1, 'q_c32', x}
 
 def cbqn_c32Tag{} = emit{u64, '', 'C32_TAG'}
 def cbqn_tagTag{} = emit{u64, '', 'TAG_TAG'}
@@ -21,14 +21,14 @@ def cbqn_nspTag{} = emit{u64, '', 'NSP_TAG'}
 def cbqn_objTag{} = emit{u64, '', 'OBJ_TAG'}
 def cbqn_arrTag{} = emit{u64, '', 'ARR_TAG'}
 
-def cbqn_elType{T if T==u1 } = 0
-def cbqn_elType{T if T==i8 } = 1
-def cbqn_elType{T if T==i16} = 2
-def cbqn_elType{T if T==i32} = 3
-def cbqn_elType{T if T==f64} = 4
-def cbqn_elType{T if T==u8 } = 5
-def cbqn_elType{T if T==u16} = 6
-def cbqn_elType{T if T==u32} = 7
+def cbqn_elType{(u1 )} = 0
+def cbqn_elType{(i8 )} = 1
+def cbqn_elType{(i16)} = 2
+def cbqn_elType{(i32)} = 3
+def cbqn_elType{(f64)} = 4
+def cbqn_elType{(u8 )} = 5
+def cbqn_elType{(u16)} = 6
+def cbqn_elType{(u32)} = 7
 
 def cbqn_tyArrOffset{} = emit{u64, 'offsetof', 'TyArr', 'a'}
diff --git a/src/singeli/src/cmp.singeli b/src/singeli/src/cmp.singeli
index 04d673d6..0d58ae1c 100644
--- a/src/singeli/src/cmp.singeli
+++ b/src/singeli/src/cmp.singeli
@@ -14,7 +14,7 @@ fn cmpIX(dst:(*u64), len:ux, x:(u64), v:(u1)) : void = {
   fillbits{dst, len, v&~nan, x}
 }
 
-def eqne{op} = same{op,__eq}|same{op,__ne}
+def eqne{op} = same{op,__eq} or same{op,__ne}
 
 def pathAS{dst, len, T, op, x if issigned{T}} = {
   def R{f if eqne{op}} = {
diff --git a/src/singeli/src/copy.singeli b/src/singeli/src/copy.singeli
index 450cc1aa..a4bb3e9a 100644
--- a/src/singeli/src/copy.singeli
+++ b/src/singeli/src/copy.singeli
@@ -30,7 +30,7 @@ fn copy{X, R}(x: *void, r: *void, l:u64, xRaw: *void) : void = {
     else emit{void, 'memcpy', r, x, l*(width{X}/8)}
   } else if (R==u64) {
     # show{'R==u64', X, R}
-    assert{((X==u8) | (X==u16)) | (X==u32)}
+    assert{X==u8 or X==u16 or X==u32}
     # TODO could maybe read 256 bits and use unpack to write >256
     @maskedLoop{bulk}(sr in tup{'g',rp}, x in tup{RV,xp} over l) sr{x | RV**(cbqn_c32Tag{}<<48)}
   } else if (X==u1 and R==u1) {
@@ -85,9 +85,9 @@ def gen{p} = {
   def tm = tup{0, 0, 0, 0, 0, 1, 1, 1, 2}
   each{{tx0,nx,mx} => {
     each{{tr0,nr,mr} => {
-      if ((mx==mr or mx==2 or mr==2) and (if (mx==2) mr==1; else 1)) {
-        def tr = if (mx==0 and mr==2) f64; else if (tx0==tr0 and mx==1) ty_s{tx0}; else tr0
-        def tx = if (mr==0 and mx==2) f64; else if (tx0==tr0 and mx==1) ty_s{tx0}; else tx0
+      if ((mx==mr or mx==2 or mr==2) and (if (mx==2) mr==1 else 1)) {
+        def tr = if (mx==0 and mr==2) f64 else if (tx0==tr0 and mx==1) ty_s{tx0} else tr0
+        def tx = if (mr==0 and mx==2) f64 else if (tx0==tr0 and mx==1) ty_s{tx0} else tx0
         export{merge{p, nx, '_', nr}, copy{tx, tr}}
       }
   }, ts, tn, tm}
diff --git a/src/singeli/src/dyarith.singeli b/src/singeli/src/dyarith.singeli
index 768b81c4..7926f00f 100644
--- a/src/singeli/src/dyarith.singeli
+++ b/src/singeli/src/dyarith.singeli
@@ -108,11 +108,11 @@ def runner{u, R, F} = {
 
 # homAny, topAny already give masked vals; anyne doesn't, and ~andAllZero assumes no masking
 def runChecks_any{F, vals} = { F{tree_fold{|, each{select{.,1}, vals}}} }
-def runChecks{type=='homAny', vals, M} = runChecks_any{homAny, vals}
-def runChecks{type=='topAny', vals, M} = runChecks_any{topAny, vals}
-def runChecks{type=='none', vals, M} = 0
-def runChecks{type=='~andAllZero', vals, M if ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}}
-def runChecks{type=='anyne', vals, M} = {
+def runChecks{('homAny'), vals, M} = runChecks_any{homAny, vals}
+def runChecks{('topAny'), vals, M} = runChecks_any{topAny, vals}
+def runChecks{('none'), vals, M} = 0
+def runChecks{('~andAllZero'), vals, M if ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}}
+def runChecks{('anyne'), vals, M} = {
   def i{vals} = {
     def {_,xs,ys} = flip{vals}
     assert{M{0} == 0}
diff --git a/src/singeli/src/mask.singeli b/src/singeli/src/mask.singeli
index 11433d84..d9979b7d 100644
--- a/src/singeli/src/mask.singeli
+++ b/src/singeli/src/mask.singeli
@@ -10,9 +10,9 @@ mask256:*i64 = merge{4 ** -1, 4 ** 0}
 local def maskOfImpl{T, n, w} = load{*ty_u{T} ~~ (*u8~~mask256 + 32 - n*(elwidth{T}/8))}
 
 # get homogeneous mask of first n items; 0 ≤ n ≤ vcount{T}
-def maskOf{T,n if width{T}==256} = maskOfImpl{T, n, 256}
-def maskOf{T,n if width{T}==128} = maskOfImpl{T, n, 128}
-def maskOf{T,n if width{T}== 64} = maskOfImpl{T, n, 64}
+def maskOf{T,n if w256{T}} = maskOfImpl{T, n, 256}
+def maskOf{T,n if w128{T}} = maskOfImpl{T, n, 128}
+def maskOf{T,n if w64{T}} = maskOfImpl{T, n, 64}
 
 def anyne{x:T, y:T, M if M{0}==0 and isvec{T}} = ~homAll{x==y}
 def anyne{x:T, y:T, M if M{0}==1 and isvec{T}} = homAny{M{x!=y}}
@@ -28,16 +28,16 @@ def anynePositive{x:T, y:T, M if M{0}==1 and isvec{T}} = {
 }
 
 def maskNone{x} = x
-def maskNone{x, mode=='all bits zeroes'} = andAllZero{x, x}
+def maskNone{x, ('all bits zeroes')} = andAllZero{x, x}
 def maskAfter{n} = {
+  def mask{x:X, ('all bits zeroes')} = andAllZero{x, X~~maskOfBit{X,n}}
+  def mask{X, ('to sign bits')} = maskOf{X,n}
+  def mask{X, ('to homogeneous bits')} = maskOf{X,n}
+  def mask{('count')} = n
+  def mask{{x}} = tup{mask{x}}
   def mask{x:X if isvec{X}} = x & (X~~maskOf{X,n})
   def mask{x:X if anyInt{x}} = x & ((1<<n) - 1)
-  def mask{x:X, mode=='all bits zeroes'} = andAllZero{x, X~~maskOfBit{X,n}}
-  def mask{X, mode=='to sign bits'} = maskOf{X,n}
-  def mask{X, mode=='to homogeneous bits'} = maskOf{X,n}
-  def mask{mode=='count'} = n
-  def mask{{x}} = tup{mask{x}}
-  def mask{x==0} = 1
+  def mask{(0)} = 1
 }
@@ -69,7 +69,7 @@ def storeBatch{ptr:P, ns, xs, M if istup{ns}} = each{{n,x} => storeBatch{ptr, n,
 
 # "harmless" pointer cast that'll only cast void*
-def hCast{T,p} = assert{show{'expected pointer with element',T,'or void but got ',p}}
+def hCast{T,p} = assert{0, 'expected pointer with element',T,'or void but got ',p}
 def hCast{T,p:*T} = p
 def hCast{T,p:(*void)} = *T~~p
@@ -123,7 +123,6 @@ def maskedLoopPositive{bulk}{vars,begin==0,end:L,iter} = {
 
 # index given is a tuple of batch indexes to process
 def muLoop{bulk, unr, fromunr}{vars,begin==0,end,iter} = {
   l:u64 = end
-  def step = 123123123
   m:u64 = l / bulk
   if (unr==1) {
diff --git a/src/singeli/src/neon.singeli b/src/singeli/src/neon.singeli
index 70933be5..2f964841 100644
--- a/src/singeli/src/neon.singeli
+++ b/src/singeli/src/neon.singeli
@@ -1,15 +1,11 @@
-def nvec{T} = 0
-def nvec{T if isvec{T}} = (width{T}==64) | (width{T}==128)
-def nvec{T,w} = 0
-def nvec{T,w if nvec{T}} = elwidth{T}==w
+def nvec{T} = isvec{T} and (width{T}==64 or width{T}==128)
+def nvec{T,w} = nvec{T} and elwidth{T}==w
 def nveci = genchk{nvec, isint}
 def nvecs = genchk{nvec, issigned}
 def nvecu = genchk{nvec, isunsigned}
 def nvecf = genchk{nvec, isfloat}
 
-def reinterpret{T, v if same{'pointer',typekind{T}} and ktup{v}} = { tmp:T=v }
-
 def nty{T} = {
   def q = quality{T}
   merge{if (q=='i') 's' else q, fmtnat{width{T}}}
 }
@@ -32,7 +28,7 @@ def shlm{a:T, s, d:T if nvecu{T}} = emit{T, ntyp{'vsli', '_n', T}, d, a, s} # (
 def bitBlend{f:T, t:T, m:M if nvec{T} and nvecu{M,elwidth{T}} and width{T}==width{M}} = emit{T, ntyp{'vbsl', T}, m, t, f}
 def homBlend{f:T, t:T, m:M if nvec{M}} = bitBlend{f, t, m}
 
-def addpw { x:T if nveci{T} and elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening
+def addpw { x:T if nveci{T} and elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening
 def addpwa{a:D, x:T if nveci{T} and elwidth{T}<=32 and D==el_m{T}} = emit{D, ntyp{'vpadal', T}, a, x} # add pairwise widening + accumulate
 def mla{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmla', T}, a, x, y} # a + x*y
 def mls{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmls', T}, a, x, y} # a - x*y
@@ -67,8 +63,8 @@ def loadLow{ptr:P, w if w==elwidth{P}} = load{ptr}
 def undefPromote{T, x:X if w64{X} and w128{T} and eltype{T}==eltype{X}} = emit{T, ntyp{'vcombine', X}, x, x}
 # arm_neon.h doesn't actually provide a way to do this in a 0-instruction way. ¯\_(ツ)_/¯
-def half{x:T, n==0 if w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x}
-def half{x:T, n==1 if w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x}
+def half{x:T, (0) if w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x}
+def half{x:T, (1) if w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x}
 def pair{a:T, b:T if w64{T}} = emit{n_d{T}, ntyp0{'vcombine', T}, a, b}
 def copyLane{dst:D, di, src:S, si if w64{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopy_lane', S}, dst, di, src, si}
 def copyLane{dst:D, di, src:S, si if w128{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopyq_lane', S}, dst, di, src, si}
@@ -125,7 +121,7 @@ def homAll{x:T if nvec{T}} = bitAll{x}
 def homMask{x:T if nvecu{T} and elwidth{T}>=vcount{T}} = {
   truncBits{vcount{T}, fold_add{x & make{T, 1<<iota{vcount{T}}}}}
 }
-def homMask{x:T if nvecu{T} and T==[16]u8} = {
+def homMask{x:T if T==[16]u8} = {
   t:= [8]u16~~sel{[16]u8, x, make{[16]u8, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15}}
   fold_add{t & make{[8]u16, (1<<iota{8})*0x0101}}
 }
diff --git a/src/singeli/src/replicate.singeli b/src/singeli/src/replicate.singeli
index bc1c6dea..444b2ab6 100644
--- a/src/singeli/src/replicate.singeli
+++ b/src/singeli/src/replicate.singeli
@@ -170,7 +170,7 @@ if_inline (hasarch{'AVX2'}) {
   def rep_const_shuffle{V, wv, xv:*V, rv:*V, n:(u64)} = rep_const_shuffle{V, wv, get_rep_iter{V, wv}, xv, rv, n}
 
-} else if (hasarch{'AARCH64'}) {
+} else if_inline (hasarch{'AARCH64'}) {
 
 def rep_iter_from_sh{sh}{x, gen} = {
   each{{s} => gen{sel{[16]u8, x, s}}, sh}
 }
diff --git a/src/singeli/src/scan.singeli b/src/singeli/src/scan.singeli
index b6719273..c3f6f628 100644
--- a/src/singeli/src/scan.singeli
+++ b/src/singeli/src/scan.singeli
@@ -146,7 +146,7 @@ fn bcs{T if hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = {
   c:= V**0
 
   def ii32 = iota{32}; def bit{k}=bit{k,ii32}; def tail{k}=tail{k,ii32}
-  def sums{n} = (if (n==0) tup{0}; else { def s=sums{n-1}; merge{s,s+1} })
+  def sums{n} = (if (n==0) tup{0} else { def s=sums{n-1}; merge{s,s+1} })
 
   def widen{v:T} = unpackQ{shuf{[4]u64, v, 4b3120}, T**0}
   def sumlanes{x:(u32)} = {
diff --git a/src/singeli/src/scan_common.singeli b/src/singeli/src/scan_common.singeli
index 0d861f2e..b6c444a5 100644
--- a/src/singeli/src/scan_common.singeli
+++ b/src/singeli/src/scan_common.singeli
@@ -15,7 +15,7 @@ def zip{up, x} = (if (up) zipHi else zipLo){x,x}
 def spread{a:VT, ...up} = {
   def w = elwidth{VT}
   def b = w/8
-  if (w<=16) sel8{a,merge{iota{12},(16-b)+iota{4}%b}, ...up}; else a
+  if (w<=16) sel8{a,merge{iota{12},(16-b)+iota{4}%b}, ...up} else a
 }
 
 # Set all elements with the last element of the input
diff --git a/src/singeli/src/search.singeli b/src/singeli/src/search.singeli
index b3f9f3de..6770f5da 100644
--- a/src/singeli/src/search.singeli
+++ b/src/singeli/src/search.singeli
@@ -60,14 +60,14 @@ fn copyOrdered{}(r:*f64, x:*f64, len:u64) : u1 = {
   0
 }
 
-if_inline (hasarch{'X86_64'} | hasarch{'AARCH64'}) {
+(if (has_simd) {
 export{'simd_search_u8', searchOne{u64, u8}}
 export{'simd_search_u16', searchOne{u64, u16}}
 export{'simd_search_u32', searchOne{u64, u32}}
 export{'simd_search_f64', searchOne{f64, f64}}
 export{'simd_search_normalizable', searchNormalizable{}}
 export{'simd_copy_ordered', copyOrdered{}}
-}
+})
 
 # In-register bit table
diff --git a/src/singeli/src/select.singeli b/src/singeli/src/select.singeli
index 983e4e2b..9d5799e3 100644
--- a/src/singeli/src/select.singeli
+++ b/src/singeli/src/select.singeli
@@ -37,7 +37,7 @@ def shuf_select{ri, rd, TI, w, r, wl, xl, selx} = {
   xlf:= VI**cast_i{TI, xl}
   @maskedLoop{ri}(cw0 in tup{VI,w}, M in 'm' over i to wl) {
     cw:= wrapChk{cw0, VI,xlf, M}
-    is:= (if (ext>1) i<<lb{ext}; else i)
+    is:= (if (ext>1) i<<lb{ext} else i)
     def se{e, c, o} = {
       c2:= shuf{[4]u64, c+c, 4b3120}
       each{
@@ -57,7 +57,7 @@ def perm_select{ri, rd, TI, w, r, wl, xl, selx} = {
   xlf:= VI**cast_i{TI, xl}
   @maskedLoop{ri}(cw0 in tup{VI,w}, M in 'm' over i to wl) {
     cw:= wrapChk{cw0, VI,xlf, M}
-    is:= (if (ext>1) i<<lb{ext}; else i)
+    is:= (if (ext>1) i<<lb{ext} else i)
     def part{o} = widen{[8]i32, re_el{i8, shuf{[4]u64, cw, 4b3210+o}}}
     def se{o} = storeExp{r, is+o, selx{part{o}}, M, ext, rd, wl}
     each{se, iota{ext}}
@@ -67,7 +67,7 @@ def perm_select{ri, rd, TI, w, r, wl, xl, selx} = {
 def makeselx{VI, VD, nsel, xd, logv, cshuf} = {
   def bblend {m}{{f,t}} = homBlend{f, t, type{f} ~~ m}
   def bblendn{m}{{t,f}} = bblend{m}{tup{f,t}}
-  def bb{c}{f, v} = (if (f) bblendn{c<v}; else bblend{(c&v)==v})
+  def bb{c}{f, v} = (if (f) bblendn{c<v} else bblend{(c&v)==v})
 
   def bs{b, c, x} = cshuf{x, c}
   def bs{b, c, x if length{b}>0} = {
diff --git a/src/singeli/src/squeeze.singeli b/src/singeli/src/squeeze.singeli
index 2d16a10f..8e63d014 100644
--- a/src/singeli/src/squeeze.singeli
+++ b/src/singeli/src/squeeze.singeli
@@ -124,7 +124,7 @@ fn squeeze{vw, X, CHR, B}(x0:*void, len:ux) : u32 = {
       }
       int
     }
-    def acc = { if (length{is}==2) r2; else r1 }
+    def acc = { if (length{is}==2) r2 else r1 }
     acc|= M{getAcc{type{acc}, int}}
   }