-rw-r--r-- | src/singeli/src/avx.singeli         |  4
-rw-r--r-- | src/singeli/src/avx2.singeli        | 14
-rw-r--r-- | src/singeli/src/base.singeli        | 69
-rw-r--r-- | src/singeli/src/bins.singeli        |  4
-rw-r--r-- | src/singeli/src/bmi2.singeli        |  8
-rw-r--r-- | src/singeli/src/cbqnDefs.singeli    | 22
-rw-r--r-- | src/singeli/src/cmp.singeli         |  2
-rw-r--r-- | src/singeli/src/copy.singeli        |  8
-rw-r--r-- | src/singeli/src/dyarith.singeli     | 10
-rw-r--r-- | src/singeli/src/mask.singeli        | 23
-rw-r--r-- | src/singeli/src/neon.singeli        | 16
-rw-r--r-- | src/singeli/src/replicate.singeli   |  2
-rw-r--r-- | src/singeli/src/scan.singeli        |  2
-rw-r--r-- | src/singeli/src/scan_common.singeli |  2
-rw-r--r-- | src/singeli/src/search.singeli      |  4
-rw-r--r-- | src/singeli/src/select.singeli      |  6
-rw-r--r-- | src/singeli/src/squeeze.singeli     |  2
17 files changed, 92 insertions, 106 deletions
diff --git a/src/singeli/src/avx.singeli b/src/singeli/src/avx.singeli
index d748be9d..bb0d9337 100644
--- a/src/singeli/src/avx.singeli
+++ b/src/singeli/src/avx.singeli
@@ -25,8 +25,8 @@ def rcpE{a:([8]f32)} = emit{[8]f32, '_mm256_rcp_ps', a}
 
 # conversion
 def half{x:T, i if w256{T} and knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i}
-def half{x:T, i==0 if w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}}
-def pair{a:T,b:T if width{T}==128} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b}
+def half{x:T, (0) if w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}}
+def pair{a:T,b:T if w128{T}} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b}
 
 def widen{T==[4]f64, x:X if X==[4]i32} = emit{T, '_mm256_cvtepi32_pd', x}
 def widen{T==[4]f64, x:X if X==[4]f32} = emit{T, '_mm256_cvtps_pd', x}
diff --git a/src/singeli/src/avx2.singeli b/src/singeli/src/avx2.singeli
index a5ed5721..8623ea36 100644
--- a/src/singeli/src/avx2.singeli
+++ b/src/singeli/src/avx2.singeli
@@ -21,12 +21,12 @@ def mul32{a:T,b:T if [ 4]u64==T} = emit{T, '_mm256_mul_epu32', a, b} # reads o
 
 # structural operations
-def shl{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n}
-def shr{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n}
+def shl{([16]u8), x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n}
+def shr{([16]u8), x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n}
 
-def blend{L==[8]u16, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m}
-def blend{L==[8]u32, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m}
-def blend{L==[4]u64, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m}
+def blend{([8]u16), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m}
+def blend{([8]u32), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m}
+def blend{([4]u64), a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m}
 def topBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = T ~~ emit{[32]i8, '_mm256_blendv_epi8', v2i{f}, v2i{t}, v2i{m}}
 def homBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = topBlend{f, t, m}
@@ -93,5 +93,5 @@ def narrow{T, x:X if w256u{X,64} and T==u16} = re_el{T, sel{[16]i8, narrow{u32,x
 def narrow{T, x:X if w256u{X,64} and T== u8} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, 4*iota{32}}}}
 
-def cvt2{T, x:X if T==i32 and X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
-def cvt2{T, x:X if T==f64 and X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x}
+def cvt2{(i32), x:X if X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x}
+def cvt2{(f64), x:X if X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x}
diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli
index b8fe0710..62c92cd4 100644
--- a/src/singeli/src/base.singeli
+++ b/src/singeli/src/base.singeli
@@ -16,11 +16,11 @@ def isconst = kcon
 def istype = ktyp
 def istup = ktup
-def isunsigned{T} = isint{T} & ~issigned{T}
+def isunsigned{T} = isint{T} and not issigned{T}
 
-def isvec {T} = 0; def isvec {T if istype{T}} = same{typekind{T},'vector'}
-def isprim{T} = 0; def isprim{T if istype{T}} = same{typekind{T},'primitive'}
-def isptr {T} = 0; def isptr {T if istype{T}} = same{typekind{T},'pointer'}
+def isvec {T} = istype{T} and same{typekind{T},'vector'}
+def isprim{T} = istype{T} and same{typekind{T},'primitive'}
+def isptr {T} = istype{T} and same{typekind{T},'pointer'}
 def elwidth{T} = width{eltype{T}}
 
 oper &~ andnot infix none 35
@@ -40,16 +40,17 @@ def loadu{p:T if elwidth{T}==8} = load{p}
 def storeu{p:T, v:(eltype{T}) if elwidth{T}==8} = store{p, v}
 
-def reinterpret{T, x:X if T==X} = x
+def reinterpret{T, x:T} = x
 
 def exportN{f, ...ns} = each{export{.,f}, ns}
-def exportT{name, fs} = { v:*type{select{fs,0}} = fs; export{name, v} }
+def exportT{name, fs} = { v:*oneType{fs} = fs; export{name, v} }
 
 # hints
 def rare{x if knum{x}} = x
 def rare{x:(u1)} = emit{u1, '__builtin_expect', x, 0}
-def assert{x if x==0} = assert{'failed assertion'}
-def assert{x if x==1} = 1
+def assert{c, ...msg} = { if (not same{c,1}) { show{...msg}; 0{} } }
+def assert{(0)} = assert{0, 'failed assertion'}
+def assert{(1)} = 1
 def unreachable{} = emit{void, 'si_unreachable'}
 def assert{x:(u1)} = { if (not x) emit{void, 'si_unreachable'} }
@@ -61,28 +62,21 @@ def oneVal{{h, ...t}} = {
 def oneVal{{}} = {}
 def oneType{x} = oneVal{each{type, x}}
-def anyNum{x} = isconst{x} | knum{x}
+def anyNum{x} = knum{x}
 def anyNum{x:T} = isprim{T}
-def anyInt{x} = 0
-def anyInt{x if knum{x}} = (x>>0) == x
-def anyInt{x if isreg{x}|isconst{x}} = isint{x}
+def anyInt{x} = knum{x} and (x>>0) == x
+def anyInt{x:T} = isint{T}
 
 # vector width/type checks
-def w64 {T} = 0; def w64 {T if isvec{T}} = width{T}==64
-def w128{T} = 0; def w128{T if isvec{T}} = width{T}==128
-def w256{T} = 0; def w256{T if isvec{T}} = width{T}==256
-def w64 {T,w} = 0; def w64 {T,w if w64{T}} = elwidth{T}==w
-def w128{T,w} = 0; def w128{T,w if w128{T}} = elwidth{T}==w
-def w256{T,w} = 0; def w256{T,w if w256{T}} = elwidth{T}==w
+def w64 {T} = isvec{T} and width{T}==64;  def w64 {T,w} = w64{T} and elwidth{T}==w
+def w128{T} = isvec{T} and width{T}==128; def w128{T,w} = w128{T} and elwidth{T}==w
+def w256{T} = isvec{T} and width{T}==256; def w256{T,w} = w256{T} and elwidth{T}==w
 
 # width+type checks
 def genchk{B, F} = {
-  def r{T} = 0
-  def r{T if B{T}} = F{eltype{T}}
-  def r{T,w} = 0
-  def r{T,w if B{T}} = F{eltype{T}} & (elwidth{T}==w)
-  def r{T if ~isvec{T}} = 0
+  def r{T } = B{T} and F{eltype{T}}
+  def r{T,w} = B{T} and F{eltype{T}} and elwidth{T}==w
   r
 }
 def w256i = genchk{w256, isint}; def w128i = genchk{w128, isint}; def w64i = genchk{w64, isint}
@@ -95,10 +89,7 @@ def w256f = genchk{w256, isfloat}; def w128f = genchk{w128, isfloat}; de
 def trunc{T, x:U if isint{T} and isint{U} and T<=U} = emit{T, '', x}
 def trunc{T, x if knum{x}} = cast{T, x}
 
-def tern{c, T, F if anyInt{c}} = {
-  if(c) T
-  else F
-}
+def tern{c, T, F if anyInt{c}} = if(c) T else F
 def tern{c, t:T, f:T if anyInt{c}} = {
   res:T = f
   if (c) res = t
@@ -151,18 +142,18 @@ def collect{vars,begin,end,iter if knum{begin} and knum{end}} = {
 }
 
 # convert tuple to number in little-endian base b
-def base{b,l} = if (0==length{l}) 0 else select{l,0}+b*base{b,slice{l,1}}
+def base{b,{}} = 0
+def base{b,{h,...t}} = h + b*base{b,t}
 
 # vector definitions
 def arch_defvw = if (hasarch{'AVX2'}) 256 else 128
-def has_simd = hasarch{'X86_64'} | hasarch{'AARCH64'}
-def fast_BMI2{} = if (SLOW_PDEP) 0 else hasarch{'BMI2'}
+def has_simd = hasarch{'X86_64'} or hasarch{'AARCH64'}
+def fast_BMI2{} = hasarch{'BMI2'} and not SLOW_PDEP
 
 # test if vector has a specific width & element type
-def lvec{T, n, w} = 0
-def lvec{T, n, w if isvec{T} and vcount{T}==n and elwidth{T}==w} = 1
+def lvec{T, n, w} = isvec{T} and vcount{T}==n and elwidth{T}==w
 
 # base cases
 def {
@@ -235,10 +226,9 @@ def popc{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_popcount', x}
 def ctz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_ctzll', x}
 def ctz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_ctz', x}
 def clz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_clzll', x}
-def clz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_clz', x}
+def clz{x:T if isint{T} and width{T}==32} = emit{ux, '__builtin_clz', x}
 # count-leading-zeros complement, less type-dependent
-def clzc{x:T if isint{T} and width{T}==64} = 64-clz{x}
-def clzc{x:T if isint{T} and width{T}<=32} = 32-clz{x}
+def clzc{x:T if isint{T}} = width{T} - clz{x}
 
 def ceil_log2{n} = clzc{n-1}
@@ -249,7 +239,7 @@ def truncBits{n, v if n==64} = cast_i{u64, v}
 
 # base-2 log of a constant power of two
 def lb{n if knum{n} and (n>>1<<1) == n and n>0} = lb{n>>1}+1
-def lb{n==1} = 0
+def lb{(1)} = 0
 
 def zlow{n,x} = (x >> n) << n # zero out n least significant bits
 def tail{n,x} = x & ((1<<n) - 1) # get the n least significant bits
@@ -292,8 +282,9 @@ def forUnroll{exp,unr}{vars,begin,end,iter} = {
     iter{each{{j}=>i+j, iota{unr}}, vars}
     i+= unr
   }
-  if (unr==2) { if (i!=end) iter{tup{i}, vars} }
-  else if (unr>1) {
+  if (unr==2) {
+    if (i!=end) iter{tup{i}, vars}
+  } else if (unr>1) {
     if (exp) {
       def stop = makelabel{}
       each{{j} => {
@@ -313,7 +304,7 @@ def makeBranch{Ts, F} = {
   def start = setlabel{}
   F{...args}
   setlabel{skip}
-  {...vs} => { each{=, args, vs}; goto{start} }
+  {...vs} => { args = vs; goto{start} }
 }
 
 def makeOptBranch{enable, Ts, F} = if (enable) makeBranch{Ts, F} else 'not defined'
diff --git a/src/singeli/src/bins.singeli b/src/singeli/src/bins.singeli
index 128ab8bb..8273f5d9 100644
--- a/src/singeli/src/bins.singeli
+++ b/src/singeli/src/bins.singeli
@@ -295,7 +295,7 @@ def bin_search_vec{prim, T, w:*T, wn, x:*T, xn, rp, maxwn if hasarch{'AVX2'}} =
   }
 }
 
-if_inline (hasarch{'AVX2'}) {
+(if (hasarch{'AVX2'}) {
 fn avx2_search_bin{prim, T, maxwn}(rp:*(if (prim=='∊') u64 else i8), w:*void, wn:u64, x:*void, xn:u64) : void = {
   bin_search_vec{prim, T, *T~~w, wn, *T~~x, xn, rp, maxwn}
 }
@@ -307,7 +307,7 @@ if_inline (hasarch{'AVX2'}) {
     'avx2_indexOf_sort', each{avx2_search_bin{'⊐',.,.}, tup{i8,i16,i32}, tup{64,16,16}}
   }
-}
+})
 
 def unroll_sizes = tup{4,1}
 fn write{T,k}(r:*void, i:u64, ...vs:k**u64) : void = {
diff --git a/src/singeli/src/bmi2.singeli b/src/singeli/src/bmi2.singeli
index 640ca3bd..e26237ed 100644
--- a/src/singeli/src/bmi2.singeli
+++ b/src/singeli/src/bmi2.singeli
@@ -1,4 +1,4 @@
-def pdep{x:(u64), m:(u64)} = emit{u64, '_pdep_u64', x, m}
-def pdep{x:(u32), m:(u32)} = emit{u32, '_pdep_u32', x, m}
-def pext{x:(u64), m:(u64)} = emit{u64, '_pext_u64', x, m}
-def pext{x:(u32), m:(u32)} = emit{u32, '_pext_u32', x, m}
+def pdep{x:T, m:T if T==u64} = emit{T, '_pdep_u64', x, m}
+def pdep{x:T, m:T if T==u32} = emit{T, '_pdep_u32', x, m}
+def pext{x:T, m:T if T==u64} = emit{T, '_pext_u64', x, m}
+def pext{x:T, m:T if T==u32} = emit{T, '_pext_u32', x, m}
diff --git a/src/singeli/src/cbqnDefs.singeli b/src/singeli/src/cbqnDefs.singeli
index cac96e91..962f1da4 100644
--- a/src/singeli/src/cbqnDefs.singeli
+++ b/src/singeli/src/cbqnDefs.singeli
@@ -5,9 +5,9 @@ def from_B{T, x if T<=u32 and isunsigned{T}} = bcall{T, 'o2cG', x}
 
 def q_f64{x} = bcall{u1, 'q_f64', x}
 def q_chr{x} = bcall{u1, 'q_c32', x}
-def q_chr{T,x if T==u8 } = bcall{u1, 'q_c8', x}
-def q_chr{T,x if T==u16} = bcall{u1, 'q_c16', x}
-def q_chr{T,x if T==u32} = bcall{u1, 'q_c32', x}
+def q_chr{(u8 ),x} = bcall{u1, 'q_c8', x}
+def q_chr{(u16),x} = bcall{u1, 'q_c16', x}
+def q_chr{(u32),x} = bcall{u1, 'q_c32', x}
 
 def cbqn_c32Tag{} = emit{u64, '', 'C32_TAG'}
 def cbqn_tagTag{} = emit{u64, '', 'TAG_TAG'}
@@ -21,14 +21,14 @@ def cbqn_nspTag{} = emit{u64, '', 'NSP_TAG'}
 def cbqn_objTag{} = emit{u64, '', 'OBJ_TAG'}
 def cbqn_arrTag{} = emit{u64, '', 'ARR_TAG'}
 
-def cbqn_elType{T if T==u1 } = 0
-def cbqn_elType{T if T==i8 } = 1
-def cbqn_elType{T if T==i16} = 2
-def cbqn_elType{T if T==i32} = 3
-def cbqn_elType{T if T==f64} = 4
-def cbqn_elType{T if T==u8 } = 5
-def cbqn_elType{T if T==u16} = 6
-def cbqn_elType{T if T==u32} = 7
+def cbqn_elType{(u1 )} = 0
+def cbqn_elType{(i8 )} = 1
+def cbqn_elType{(i16)} = 2
+def cbqn_elType{(i32)} = 3
+def cbqn_elType{(f64)} = 4
+def cbqn_elType{(u8 )} = 5
+def cbqn_elType{(u16)} = 6
+def cbqn_elType{(u32)} = 7
 
 def cbqn_tyArrOffset{} = emit{u64, 'offsetof', 'TyArr', 'a'}
diff --git a/src/singeli/src/cmp.singeli b/src/singeli/src/cmp.singeli
index 04d673d6..0d58ae1c 100644
--- a/src/singeli/src/cmp.singeli
+++ b/src/singeli/src/cmp.singeli
@@ -14,7 +14,7 @@ fn cmpIX(dst:(*u64), len:ux, x:(u64), v:(u1)) : void = {
   fillbits{dst, len, v&~nan, x}
 }
 
-def eqne{op} = same{op,__eq}|same{op,__ne}
+def eqne{op} = same{op,__eq} or same{op,__ne}
 
 def pathAS{dst, len, T, op, x if issigned{T}} = {
   def R{f if eqne{op}} = {
diff --git a/src/singeli/src/copy.singeli b/src/singeli/src/copy.singeli
index 450cc1aa..a4bb3e9a 100644
--- a/src/singeli/src/copy.singeli
+++ b/src/singeli/src/copy.singeli
@@ -30,7 +30,7 @@ fn copy{X, R}(x: *void, r: *void, l:u64, xRaw: *void) : void = {
     else emit{void, 'memcpy', r, x, l*(width{X}/8)}
   } else if (R==u64) {
     # show{'R==u64', X, R}
-    assert{((X==u8) | (X==u16)) | (X==u32)}
+    assert{X==u8 or X==u16 or X==u32}
     # TODO could maybe read 256 bits and use unpack to write >256
     @maskedLoop{bulk}(sr in tup{'g',rp}, x in tup{RV,xp} over l) sr{x | RV**(cbqn_c32Tag{}<<48)}
   } else if (X==u1 and R==u1) {
@@ -85,9 +85,9 @@ def gen{p} = {
   def tm = tup{0, 0, 0, 0, 0, 1, 1, 1, 2}
   each{{tx0,nx,mx} => {
     each{{tr0,nr,mr} => {
-      if ((mx==mr or mx==2 or mr==2) and (if (mx==2) mr==1; else 1)) {
-        def tr = if (mx==0 and mr==2) f64; else if (tx0==tr0 and mx==1) ty_s{tx0}; else tr0
-        def tx = if (mr==0 and mx==2) f64; else if (tx0==tr0 and mx==1) ty_s{tx0}; else tx0
+      if ((mx==mr or mx==2 or mr==2) and (if (mx==2) mr==1 else 1)) {
+        def tr = if (mx==0 and mr==2) f64 else if (tx0==tr0 and mx==1) ty_s{tx0} else tr0
+        def tx = if (mr==0 and mx==2) f64 else if (tx0==tr0 and mx==1) ty_s{tx0} else tx0
         export{merge{p, nx, '_', nr}, copy{tx, tr}}
       }
   }, ts, tn, tm}
diff --git a/src/singeli/src/dyarith.singeli b/src/singeli/src/dyarith.singeli
index 768b81c4..7926f00f 100644
--- a/src/singeli/src/dyarith.singeli
+++ b/src/singeli/src/dyarith.singeli
@@ -108,11 +108,11 @@ def runner{u, R, F} = {
 
 # homAny, topAny already give masked vals; anyne doesn't, and ~andAllZero assumes no masking
 def runChecks_any{F, vals} = { F{tree_fold{|, each{select{.,1}, vals}}} }
-def runChecks{type=='homAny', vals, M} = runChecks_any{homAny, vals}
-def runChecks{type=='topAny', vals, M} = runChecks_any{topAny, vals}
-def runChecks{type=='none', vals, M} = 0
-def runChecks{type=='~andAllZero', vals, M if ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}}
-def runChecks{type=='anyne', vals, M} = {
+def runChecks{('homAny'), vals, M} = runChecks_any{homAny, vals}
+def runChecks{('topAny'), vals, M} = runChecks_any{topAny, vals}
+def runChecks{('none'), vals, M} = 0
+def runChecks{('~andAllZero'), vals, M if ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}}
+def runChecks{('anyne'), vals, M} = {
   def i{vals} = {
     def {_,xs,ys} = flip{vals}
     assert{M{0} == 0}
diff --git a/src/singeli/src/mask.singeli b/src/singeli/src/mask.singeli
index 11433d84..d9979b7d 100644
--- a/src/singeli/src/mask.singeli
+++ b/src/singeli/src/mask.singeli
@@ -10,9 +10,9 @@ mask256:*i64 = merge{4 ** -1, 4 ** 0}
 local def maskOfImpl{T, n, w} = load{*ty_u{T} ~~ (*u8~~mask256 + 32 - n*(elwidth{T}/8))}
 
 # get homogeneous mask of first n items; 0 ≤ n ≤ vcount{T}
-def maskOf{T,n if width{T}==256} = maskOfImpl{T, n, 256}
-def maskOf{T,n if width{T}==128} = maskOfImpl{T, n, 128}
-def maskOf{T,n if width{T}== 64} = maskOfImpl{T, n, 64}
+def maskOf{T,n if w256{T}} = maskOfImpl{T, n, 256}
+def maskOf{T,n if w128{T}} = maskOfImpl{T, n, 128}
+def maskOf{T,n if w64{T}} = maskOfImpl{T, n, 64}
 
 def anyne{x:T, y:T, M if M{0}==0 and isvec{T}} = ~homAll{x==y}
 def anyne{x:T, y:T, M if M{0}==1 and isvec{T}} = homAny{M{x!=y}}
@@ -28,16 +28,16 @@ def anynePositive{x:T, y:T, M if M{0}==1 and isvec{T}} = {
 }
 
 def maskNone{x} = x
-def maskNone{x, mode=='all bits zeroes'} = andAllZero{x, x}
+def maskNone{x, ('all bits zeroes')} = andAllZero{x, x}
 def maskAfter{n} = {
+  def mask{x:X, ('all bits zeroes')} = andAllZero{x, X~~maskOfBit{X,n}}
+  def mask{X, ('to sign bits')} = maskOf{X,n}
+  def mask{X, ('to homogeneous bits')} = maskOf{X,n}
+  def mask{('count')} = n
+  def mask{{x}} = tup{mask{x}}
   def mask{x:X if isvec{X}} = x & (X~~maskOf{X,n})
   def mask{x:X if anyInt{x}} = x & ((1<<n) - 1)
-  def mask{x:X, mode=='all bits zeroes'} = andAllZero{x, X~~maskOfBit{X,n}}
-  def mask{X, mode=='to sign bits'} = maskOf{X,n}
-  def mask{X, mode=='to homogeneous bits'} = maskOf{X,n}
-  def mask{mode=='count'} = n
-  def mask{{x}} = tup{mask{x}}
-  def mask{x==0} = 1
+  def mask{(0)} = 1
 }
@@ -69,7 +69,7 @@ def storeBatch{ptr:P, ns, xs, M if istup{ns}} = each{{n,x} => storeBatch{ptr, n,
 
 # "harmless" pointer cast that'll only cast void*
-def hCast{T,p} = assert{show{'expected pointer with element',T,'or void but got ',p}}
+def hCast{T,p} = assert{0, 'expected pointer with element',T,'or void but got ',p}
 def hCast{T,p:*T} = p
 def hCast{T,p:(*void)} = *T~~p
@@ -123,7 +123,6 @@ def maskedLoopPositive{bulk}{vars,begin==0,end:L,iter} = {
 
 # index given is a tuple of batch indexes to process
 def muLoop{bulk, unr, fromunr}{vars,begin==0,end,iter} = {
   l:u64 = end
-  def step = 123123123
   m:u64 = l / bulk
   if (unr==1) {
diff --git a/src/singeli/src/neon.singeli b/src/singeli/src/neon.singeli
index 70933be5..2f964841 100644
--- a/src/singeli/src/neon.singeli
+++ b/src/singeli/src/neon.singeli
@@ -1,15 +1,11 @@
-def nvec{T} = 0
-def nvec{T if isvec{T}} = (width{T}==64) | (width{T}==128)
-def nvec{T,w} = 0
-def nvec{T,w if nvec{T}} = elwidth{T}==w
+def nvec{T} = isvec{T} and (width{T}==64 or width{T}==128)
+def nvec{T,w} = nvec{T} and elwidth{T}==w
 def nveci = genchk{nvec, isint}
 def nvecs = genchk{nvec, issigned}
 def nvecu = genchk{nvec, isunsigned}
 def nvecf = genchk{nvec, isfloat}
 
-def reinterpret{T, v if same{'pointer',typekind{T}} and ktup{v}} = { tmp:T=v }
-
 def nty{T} = {
   def q = quality{T}
   merge{if (q=='i') 's' else q, fmtnat{width{T}}}
 }
@@ -32,7 +28,7 @@ def shlm{a:T, s, d:T if nvecu{T}} = emit{T, ntyp{'vsli', '_n', T}, d, a, s} # (
 def bitBlend{f:T, t:T, m:M if nvec{T} and nvecu{M,elwidth{T}} and width{T}==width{M}} = emit{T, ntyp{'vbsl', T}, m, t, f}
 def homBlend{f:T, t:T, m:M if nvec{M}} = bitBlend{f, t, m}
 
-def addpw { x:T if nveci{T} and elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening
+def addpw { x:T if nveci{T} and elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening
 def addpwa{a:D, x:T if nveci{T} and elwidth{T}<=32 and D==el_m{T}} = emit{D, ntyp{'vpadal', T}, a, x} # add pairwise widening + accumulate
 def mla{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmla', T}, a, x, y} # a + x*y
 def mls{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmls', T}, a, x, y} # a - x*y
@@ -67,8 +63,8 @@ def loadLow{ptr:P, w if w==elwidth{P}} = load{ptr}
 def undefPromote{T, x:X if w64{X} and w128{T} and eltype{T}==eltype{X}} = emit{T, ntyp{'vcombine', X}, x, x}
 # arm_neon.h doesn't actually provide a way to do this in a 0-instruction way. ¯\_(ツ)_/¯
-def half{x:T, n==0 if w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x}
-def half{x:T, n==1 if w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x}
+def half{x:T, (0) if w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x}
+def half{x:T, (1) if w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x}
 def pair{a:T, b:T if w64{T}} = emit{n_d{T}, ntyp0{'vcombine', T}, a, b}
 def copyLane{dst:D, di, src:S, si if w64{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopy_lane', S}, dst, di, src, si}
 def copyLane{dst:D, di, src:S, si if w128{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopyq_lane', S}, dst, di, src, si}
@@ -125,7 +121,7 @@ def homAll{x:T if nvec{T}} = bitAll{x}
 def homMask{x:T if nvecu{T} and elwidth{T}>=vcount{T}} = {
   truncBits{vcount{T}, fold_add{x & make{T, 1<<iota{vcount{T}}}}}
 }
-def homMask{x:T if nvecu{T} and T==[16]u8} = {
+def homMask{x:T if T==[16]u8} = {
   t:= [8]u16~~sel{[16]u8, x, make{[16]u8, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15}}
   fold_add{t & make{[8]u16, (1<<iota{8})*0x0101}}
 }
diff --git a/src/singeli/src/replicate.singeli b/src/singeli/src/replicate.singeli
index bc1c6dea..444b2ab6 100644
--- a/src/singeli/src/replicate.singeli
+++ b/src/singeli/src/replicate.singeli
@@ -170,7 +170,7 @@ if_inline (hasarch{'AVX2'}) {
   def rep_const_shuffle{V, wv, xv:*V, rv:*V, n:(u64)} = rep_const_shuffle{V, wv, get_rep_iter{V, wv}, xv, rv, n}
 
-} else if (hasarch{'AARCH64'}) {
+} else if_inline (hasarch{'AARCH64'}) {
 
 def rep_iter_from_sh{sh}{x, gen} = {
   each{{s} => gen{sel{[16]u8, x, s}}, sh}
 }
diff --git a/src/singeli/src/scan.singeli b/src/singeli/src/scan.singeli
index b6719273..c3f6f628 100644
--- a/src/singeli/src/scan.singeli
+++ b/src/singeli/src/scan.singeli
@@ -146,7 +146,7 @@ fn bcs{T if hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = {
   c:= V**0
 
   def ii32 = iota{32}; def bit{k}=bit{k,ii32}; def tail{k}=tail{k,ii32}
-  def sums{n} = (if (n==0) tup{0}; else { def s=sums{n-1}; merge{s,s+1} })
+  def sums{n} = (if (n==0) tup{0} else { def s=sums{n-1}; merge{s,s+1} })
 
   def widen{v:T} = unpackQ{shuf{[4]u64, v, 4b3120}, T**0}
   def sumlanes{x:(u32)} = {
diff --git a/src/singeli/src/scan_common.singeli b/src/singeli/src/scan_common.singeli
index 0d861f2e..b6c444a5 100644
--- a/src/singeli/src/scan_common.singeli
+++ b/src/singeli/src/scan_common.singeli
@@ -15,7 +15,7 @@ def zip{up, x} = (if (up) zipHi else zipLo){x,x}
 def spread{a:VT, ...up} = {
   def w = elwidth{VT}
   def b = w/8
-  if (w<=16) sel8{a,merge{iota{12},(16-b)+iota{4}%b}, ...up}; else a
+  if (w<=16) sel8{a,merge{iota{12},(16-b)+iota{4}%b}, ...up} else a
 }
 
 # Set all elements with the last element of the input
diff --git a/src/singeli/src/search.singeli b/src/singeli/src/search.singeli
index b3f9f3de..6770f5da 100644
--- a/src/singeli/src/search.singeli
+++ b/src/singeli/src/search.singeli
@@ -60,14 +60,14 @@ fn copyOrdered{}(r:*f64, x:*f64, len:u64) : u1 = {
   0
 }
 
-if_inline (hasarch{'X86_64'} | hasarch{'AARCH64'}) {
+(if (has_simd) {
 export{'simd_search_u8', searchOne{u64, u8}}
 export{'simd_search_u16', searchOne{u64, u16}}
 export{'simd_search_u32', searchOne{u64, u32}}
 export{'simd_search_f64', searchOne{f64, f64}}
 export{'simd_search_normalizable', searchNormalizable{}}
 export{'simd_copy_ordered', copyOrdered{}}
-}
+})
 
 # In-register bit table
diff --git a/src/singeli/src/select.singeli b/src/singeli/src/select.singeli
index 983e4e2b..9d5799e3 100644
--- a/src/singeli/src/select.singeli
+++ b/src/singeli/src/select.singeli
@@ -37,7 +37,7 @@ def shuf_select{ri, rd, TI, w, r, wl, xl, selx} = {
   xlf:= VI**cast_i{TI, xl}
   @maskedLoop{ri}(cw0 in tup{VI,w}, M in 'm' over i to wl) {
     cw:= wrapChk{cw0, VI,xlf, M}
-    is:= (if (ext>1) i<<lb{ext}; else i)
+    is:= (if (ext>1) i<<lb{ext} else i)
     def se{e, c, o} = {
       c2:= shuf{[4]u64, c+c, 4b3120}
       each{
@@ -57,7 +57,7 @@ def perm_select{ri, rd, TI, w, r, wl, xl, selx} = {
   xlf:= VI**cast_i{TI, xl}
   @maskedLoop{ri}(cw0 in tup{VI,w}, M in 'm' over i to wl) {
     cw:= wrapChk{cw0, VI,xlf, M}
-    is:= (if (ext>1) i<<lb{ext}; else i)
+    is:= (if (ext>1) i<<lb{ext} else i)
     def part{o} = widen{[8]i32, re_el{i8, shuf{[4]u64, cw, 4b3210+o}}}
     def se{o} = storeExp{r, is+o, selx{part{o}}, M, ext, rd, wl}
     each{se, iota{ext}}
@@ -67,7 +67,7 @@ def perm_select{ri, rd, TI, w, r, wl, xl, selx} = {
 def makeselx{VI, VD, nsel, xd, logv, cshuf} = {
   def bblend {m}{{f,t}} = homBlend{f, t, type{f} ~~ m}
   def bblendn{m}{{t,f}} = bblend{m}{tup{f,t}}
-  def bb{c}{f, v} = (if (f) bblendn{c<v}; else bblend{(c&v)==v})
+  def bb{c}{f, v} = (if (f) bblendn{c<v} else bblend{(c&v)==v})
 
   def bs{b, c, x} = cshuf{x, c}
   def bs{b, c, x if length{b}>0} = {
diff --git a/src/singeli/src/squeeze.singeli b/src/singeli/src/squeeze.singeli
index 2d16a10f..8e63d014 100644
--- a/src/singeli/src/squeeze.singeli
+++ b/src/singeli/src/squeeze.singeli
@@ -124,7 +124,7 @@ fn squeeze{vw, X, CHR, B}(x0:*void, len:ux) : u32 = {
       }
       int
     }
-    def acc = { if (length{is}==2) r2; else r1 }
+    def acc = { if (length{is}==2) r2 else r1 }
     acc|= M{getAcc{type{acc}, int}}
   }