diff options
author | Marshall Lochbaum <mwlochbaum@gmail.com> | 2024-05-10 10:26:28 -0400 |
---|---|---|
committer | Marshall Lochbaum <mwlochbaum@gmail.com> | 2024-05-10 10:26:28 -0400 |
commit | b2e3a5ff74d3025614ca0b12073b1a2b8ec90313 (patch) | |
tree | 0d7489051f271ea420b9d92acfaf7bbaf3cff1b4 | |
parent | 4d6612cb16ceeb1a87caba83b2bafb4550526512 (diff) |
Move from & to if/and for Singeli conditions
28 files changed, 523 insertions, 523 deletions
diff --git a/src/singeli/src/avx.singeli b/src/singeli/src/avx.singeli index b4b6894f..d748be9d 100644 --- a/src/singeli/src/avx.singeli +++ b/src/singeli/src/avx.singeli @@ -1,54 +1,54 @@ # compact casting for the annoying intrinsic type system -def v2i{x:T & w256{T}} = if(isint{eltype{T}}) x else [32]u8 ~~ x -def v2f{x:T & w256{T}} = [8]f32 ~~ x -def v2d{x:T & w256{T}} = [4]f64 ~~ x +def v2i{x:T if w256{T}} = if(isint{eltype{T}}) x else [32]u8 ~~ x +def v2f{x:T if w256{T}} = [8]f32 ~~ x +def v2d{x:T if w256{T}} = [4]f64 ~~ x -def undefPromote{T, x:X & w128{X} & w256{T} & eltype{T}==eltype{X}} = T~~emit{[32]u8, '_mm256_castsi128_si256', v2i{x}} +def undefPromote{T, x:X if w128{X} and w256{T} and eltype{T}==eltype{X}} = T~~emit{[32]u8, '_mm256_castsi128_si256', v2i{x}} # load & store -def loadLow{ptr:P, w & w256{eltype{P}} & w<=128} = undefPromote{eltype{P}, loadLow{*n_h{eltype{P}} ~~ ptr, w}} -def loadLow{ptr:P, w & w256{eltype{P}} & w==256} = load{ptr} +def loadLow{ptr:P, w if w256{eltype{P}} and w<=128} = undefPromote{eltype{P}, loadLow{*n_h{eltype{P}} ~~ ptr, w}} +def loadLow{ptr:P, w if w256{eltype{P}} and w==256} = load{ptr} -def storeLow{ptr:P, w, x:T & w256{T} & w<=128} = storeLow{ptr, w, half{x, 0}} -def storeLow{ptr:P, w, x:T & w256{T} & w==256} = store{*T~~ptr, 0, x} +def storeLow{ptr:P, w, x:T if w256{T} and w<=128} = storeLow{ptr, w, half{x, 0}} +def storeLow{ptr:P, w, x:T if w256{T} and w==256} = store{*T~~ptr, 0, x} # float comparison local def f32cmpAVX{a,b,n} = [8]u32 ~~ emit{[8]f32, '_mm256_cmp_ps', a, b, n} local def f64cmpAVX{a,b,n} = [4]u64 ~~ emit{[4]f64, '_mm256_cmp_pd', a, b, n} -def unord{a:T,b:T & T==[8]f32} = f32cmpAVX{a,b,3} -def unord{a:T,b:T & T==[4]f64} = f64cmpAVX{a,b,3} +def unord{a:T,b:T if T==[8]f32} = f32cmpAVX{a,b,3} +def unord{a:T,b:T if T==[4]f64} = f64cmpAVX{a,b,3} # f32 arith def rsqrtE{a:([8]f32)} = emit{[8]f32, '_mm256_rsqrt_ps', a} def rcpE{a:([8]f32)} = emit{[8]f32, '_mm256_rcp_ps', a} # conversion -def half{x:T, i & w256{T} & knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i} -def half{x:T, i==0 & w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}} -def pair{a:T,b:T & width{T}==128} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b} +def half{x:T, i if w256{T} and knum{i}} = n_h{T} ~~ emit{[8]i16, '_mm256_extracti128_si256', v2i{x}, i} +def half{x:T, i==0 if w256{T}} = n_h{T} ~~ emit{[8]i16, '_mm256_castsi256_si128', v2i{x}} +def pair{a:T,b:T if width{T}==128} = n_d{T} ~~ emit{[8]i32, '_mm256_setr_m128i', a, b} -def widen{T==[4]f64, x:X & X==[4]i32} = emit{T, '_mm256_cvtepi32_pd', x} -def widen{T==[4]f64, x:X & X==[4]f32} = emit{T, '_mm256_cvtps_pd', x} -def widen{T==[4]f64, x:X & w128i{X} & elwidth{X}<32} = widen{T, widen{[4]i32, x}} -def widen{T, x:X & w256{X} & vcount{X}>vcount{T}} = widen{T, half{x,0}} +def widen{T==[4]f64, x:X if X==[4]i32} = emit{T, '_mm256_cvtepi32_pd', x} +def widen{T==[4]f64, x:X if X==[4]f32} = emit{T, '_mm256_cvtps_pd', x} +def widen{T==[4]f64, x:X if w128i{X} and elwidth{X}<32} = widen{T, widen{[4]i32, x}} +def widen{T, x:X if w256{X} and vcount{X}>vcount{T}} = widen{T, half{x,0}} # structural operations -def topBlend{f:T, t:T, m:M & w256{T,32} & w256i{M,32}} = T ~~ emit{[8]f32, '_mm256_blendv_ps', v2f{f}, v2f{t}, v2f{m}} -def topBlend{f:T, t:T, m:M & w256{T,64} & w256i{M,64}} = T ~~ emit{[4]f64, '_mm256_blendv_pd', v2d{f}, v2d{t}, v2d{m}} -def homBlend{f:T, t:T, m:M & w256{T}} = topBlend{f, t, m} +def topBlend{f:T, t:T, m:M if w256{T,32} and w256i{M,32}} = T ~~ emit{[8]f32, '_mm256_blendv_ps', v2f{f}, v2f{t}, v2f{m}} +def topBlend{f:T, t:T, m:M if w256{T,64} and w256i{M,64}} = T ~~ emit{[4]f64, '_mm256_blendv_pd', v2d{f}, v2d{t}, v2d{m}} +def homBlend{f:T, t:T, m:M if w256{T}} = topBlend{f, t, m} # mask stuff -def andAllZero{x:T, y:T & w256i{T}} = emit{u1, '_mm256_testz_si256', x, y} +def andAllZero{x:T, y:T if w256i{T}} = emit{u1, '_mm256_testz_si256', x, y} -def topMask{x:T & w256{T, 32}} = emit{u8, '_mm256_movemask_ps', v2f{x}} -def topMask{x:T & w256{T, 64}} = emit{u8, '_mm256_movemask_pd', v2d{x}} -def homMask{x:T & w256{T}} = topMask{x} +def topMask{x:T if w256{T, 32}} = emit{u8, '_mm256_movemask_ps', v2f{x}} +def topMask{x:T if w256{T, 64}} = emit{u8, '_mm256_movemask_pd', v2d{x}} +def homMask{x:T if w256{T}} = topMask{x} -def homAny{x:T & w256i{T} & elwidth{T}>=32} = homMask{[8]u32 ~~ x} != 0 -def homAll{x:T & w256i{T} & elwidth{T}>=32} = homMask{[8]u32 ~~ x} == 0xff +def homAny{x:T if w256i{T} and elwidth{T}>=32} = homMask{[8]u32 ~~ x} != 0 +def homAll{x:T if w256i{T} and elwidth{T}>=32} = homMask{[8]u32 ~~ x} == 0xff -def topAny{x:T & w256i{T} & elwidth{T}>=32} = topMask{x} != 0 -def topAll{x:T & w256i{T} & elwidth{T}>=32} = topMask{x} == (1<<vcount{T})-1 +def topAny{x:T if w256i{T} and elwidth{T}>=32} = topMask{x} != 0 +def topAll{x:T if w256i{T} and elwidth{T}>=32} = topMask{x} == (1<<vcount{T})-1 diff --git a/src/singeli/src/avx2.singeli b/src/singeli/src/avx2.singeli index d50e1543..a5ed5721 100644 --- a/src/singeli/src/avx2.singeli +++ b/src/singeli/src/avx2.singeli @@ -1,97 +1,97 @@ # questionable pack -def unpackQ{a:T,b:T & T==[32]i8 } = { tup{emit{[16]i16, '_mm256_unpacklo_epi8', a, b}, emit{[16]i16, '_mm256_unpackhi_epi8', a, b}}} -def unpackQ{a:T,b:T & T==[16]i16} = { tup{emit{[ 8]i32, '_mm256_unpacklo_epi16', a, b}, emit{[ 8]i32, '_mm256_unpackhi_epi16', a, b}}} -def unpackQ{a:T,b:T & T==[ 8]i32} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi32', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi32', a, b}}} -def unpackQ{a:T,b:T & T==[ 4]i64} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi64', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi64', a, b}}} +def unpackQ{a:T,b:T if T==[32]i8 } = { tup{emit{[16]i16, '_mm256_unpacklo_epi8', a, b}, emit{[16]i16, '_mm256_unpackhi_epi8', a, b}}} +def unpackQ{a:T,b:T if T==[16]i16} = { tup{emit{[ 8]i32, '_mm256_unpacklo_epi16', a, b}, emit{[ 8]i32, '_mm256_unpackhi_epi16', a, b}}} +def unpackQ{a:T,b:T if T==[ 8]i32} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi32', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi32', a, b}}} +def unpackQ{a:T,b:T if T==[ 4]i64} = { tup{emit{[ 4]i64, '_mm256_unpacklo_epi64', a, b}, emit{[ 4]i64, '_mm256_unpackhi_epi64', a, b}}} # inverse of questionable pack; these saturate the argument -def packQ{a:T,b:T & T==[16]i16} = emit{[32]i8, '_mm256_packs_epi16', a, b} -def packQ{a:T,b:T & T==[ 8]i32} = emit{[16]i16, '_mm256_packs_epi32', a, b} -def packQ{a:T,b:T & T==[16]u16} = emit{[32]u8, '_mm256_packus_epi16', a, b} -def packQ{a:T,b:T & T==[ 8]u32} = emit{[16]u16, '_mm256_packus_epi32', a, b} +def packQ{a:T,b:T if T==[16]i16} = emit{[32]i8, '_mm256_packs_epi16', a, b} +def packQ{a:T,b:T if T==[ 8]i32} = emit{[16]i16, '_mm256_packs_epi32', a, b} +def packQ{a:T,b:T if T==[16]u16} = emit{[32]u8, '_mm256_packus_epi16', a, b} +def packQ{a:T,b:T if T==[ 8]u32} = emit{[16]u16, '_mm256_packus_epi32', a, b} # super questionable pack - assumes high halves are zero -def packQQ{a:T,b:T & T==[4]i64} = emit{[8]i32, '_mm256_shuffle_epi32', a, 4b1120} | emit{[8]i32, '_mm256_shuffle_epi32', b, 4b2011} +def packQQ{a:T,b:T if T==[4]i64} = emit{[8]i32, '_mm256_shuffle_epi32', a, 4b1120} | emit{[8]i32, '_mm256_shuffle_epi32', b, 4b2011} def packQQ{{a, b}} = packQQ{a, b} # arith -def mulh {a:T,b:T & [16]i16==T} = emit{T, '_mm256_mulhi_epi16', a, b} -def mulh {a:T,b:T & [16]u16==T} = emit{T, '_mm256_mulhi_epu16', a, b} -def mul32{a:T,b:T & [ 4]i64==T} = emit{T, '_mm256_mul_epi32', a, b} # reads only low 32 bits of arguments -def mul32{a:T,b:T & [ 4]u64==T} = emit{T, '_mm256_mul_epu32', a, b} # reads only low 32 bits of arguments +def mulh {a:T,b:T if [16]i16==T} = emit{T, '_mm256_mulhi_epi16', a, b} +def mulh {a:T,b:T if [16]u16==T} = emit{T, '_mm256_mulhi_epu16', a, b} +def mul32{a:T,b:T if [ 4]i64==T} = emit{T, '_mm256_mul_epi32', a, b} # reads only low 32 bits of arguments +def mul32{a:T,b:T if [ 4]u64==T} = emit{T, '_mm256_mul_epu32', a, b} # reads only low 32 bits of arguments # structural operations -def shl{S==[16]u8, x:T, n & w256{T} & knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n} -def shr{S==[16]u8, x:T, n & w256{T} & knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n} +def shl{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bslli_epi128', x, n} +def shr{S==[16]u8, x:T, n if w256{T} and knum{n}} = T ~~ emit{T, '_mm256_bsrli_epi128', x, n} -def blend{L==[8]u16, a:T, b:T, m & w256{T} & knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m} -def blend{L==[8]u32, a:T, b:T, m & w256{T} & knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m} -def blend{L==[4]u64, a:T, b:T, m & w256{T} & knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m} +def blend{L==[8]u16, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[16]i16, '_mm256_blend_epi16', v2i{a}, v2i{b}, m} +def blend{L==[8]u32, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 8]i32, '_mm256_blend_epi32', v2i{a}, v2i{b}, m} +def blend{L==[4]u64, a:T, b:T, m if w256{T} and knum{m}} = T ~~ emit{[ 4]f64, '_mm256_blend_pd', v2d{a}, v2d{b}, m} -def topBlend{f:T, t:T, m:M & w256{T, 8} & w256i{M, 8}} = T ~~ emit{[32]i8, '_mm256_blendv_epi8', v2i{f}, v2i{t}, v2i{m}} -def homBlend{f:T, t:T, m:M & w256{T, 8} & w256i{M, 8}} = topBlend{f, t, m} -def homBlend{f:T, t:T, m:M & w256{T, 16} & w256i{M,16}} = T ~~ topBlend{[32]i8~~f, [32]i8~~t, [32]i8~~m} +def topBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = T ~~ emit{[32]i8, '_mm256_blendv_epi8', v2i{f}, v2i{t}, v2i{m}} +def homBlend{f:T, t:T, m:M if w256{T, 8} and w256i{M, 8}} = topBlend{f, t, m} +def homBlend{f:T, t:T, m:M if w256{T, 16} and w256i{M,16}} = T ~~ topBlend{[32]i8~~f, [32]i8~~t, [32]i8~~m} -def shuf{L, x:T, n & lvec{L,4,32} & w256{T} & knum{n}} = T ~~ emit{[8]i32, '_mm256_shuffle_epi32', v2i{x}, n} -def shuf{L, x:T, n & lvec{L,4,64} & w256{T} & knum{n}} = T ~~ emit{[4]f64, '_mm256_permute4x64_pd', v2d{x}, n} -def shufHalves{x:T, y:T, n & w256{T} & knum{n}} = T ~~ emit{[4]i64, '_mm256_permute2x128_si256', v2i{x}, v2i{y}, n} +def shuf{L, x:T, n if lvec{L,4,32} and w256{T} and knum{n}} = T ~~ emit{[8]i32, '_mm256_shuffle_epi32', v2i{x}, n} +def shuf{L, x:T, n if lvec{L,4,64} and w256{T} and knum{n}} = T ~~ emit{[4]f64, '_mm256_permute4x64_pd', v2d{x}, n} +def shufHalves{x:T, y:T, n if w256{T} and knum{n}} = T ~~ emit{[4]i64, '_mm256_permute2x128_si256', v2i{x}, v2i{y}, n} -def sel{L, x:T, i:I & w256{T} & lvec{L,8,32} & w256{I,32}} = T ~~ emit{[32]u8, '_mm256_permutevar8x32_epi32', v2i{x}, i} -def sel{L, x:T, i:I & w256{T} & lvec{L,16,8} & w256{I, 8}} = T ~~ emit{[32]u8, '_mm256_shuffle_epi8', v2i{x}, i} +def sel{L, x:T, i:I if w256{T} and lvec{L,8,32} and w256{I,32}} = T ~~ emit{[32]u8, '_mm256_permutevar8x32_epi32', v2i{x}, i} +def sel{L, x:T, i:I if w256{T} and lvec{L,16,8} and w256{I, 8}} = T ~~ emit{[32]u8, '_mm256_shuffle_epi8', v2i{x}, i} # masked store; F variants may not be a single instruction -def topMaskStore{p:P, m:M, v:T & w256i{M, 32} & w256{T,elwidth{M}} & eltype{P}==T} = emit{void, '_mm256_maskstore_epi32', *i32~~p, m, [8]i32~~v} -def topMaskStore{p:P, m:M, v:T & w256i{M, 64} & w256{T,elwidth{M}} & eltype{P}==T} = emit{void, '_mm256_maskstore_pd', *f64~~p, m, [4]f64~~v} -def homMaskStore{p:P, m:M, v:T & w256i{M} & w256{T,elwidth{M}} & eltype{P}==T} = topMaskStore{p, m, v} +def topMaskStore{p:P, m:M, v:T if w256i{M, 32} and w256{T,elwidth{M}} and eltype{P}==T} = emit{void, '_mm256_maskstore_epi32', *i32~~p, m, [8]i32~~v} +def topMaskStore{p:P, m:M, v:T if w256i{M, 64} and w256{T,elwidth{M}} and eltype{P}==T} = emit{void, '_mm256_maskstore_pd', *f64~~p, m, [4]f64~~v} +def homMaskStore{p:P, m:M, v:T if w256i{M} and w256{T,elwidth{M}} and eltype{P}==T} = topMaskStore{p, m, v} -def topMaskStoreF{p:P, m:M, v:T & w256i{M} & elwidth{T}>=32} = topMaskStore{p,m,v} -def homMaskStoreF{p:P, m:M, v:T & w256i{M} & elwidth{T}>=32} = topMaskStore{p,m,v} -def homMaskStoreF{p:P, m:M, v:T & w256i{M} & elwidth{T}<=16 & w256{T,elwidth{M}} & eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}} +def topMaskStoreF{p:P, m:M, v:T if w256i{M} and elwidth{T}>=32} = topMaskStore{p,m,v} +def homMaskStoreF{p:P, m:M, v:T if w256i{M} and elwidth{T}>=32} = topMaskStore{p,m,v} +def homMaskStoreF{p:P, m:M, v:T if w256i{M} and elwidth{T}<=16 and w256{T,elwidth{M}} and eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}} # mask stuff -def topMask{x:T & w256{T, 8}} = emit{u32, '_mm256_movemask_epi8', x} -def topMask{x:T & w256{T, 16}} = { +def topMask{x:T if w256{T, 8}} = emit{u32, '_mm256_movemask_epi8', x} +def topMask{x:T if w256{T, 16}} = { msk:u32 = topMask{emit{[32]u8, '_mm256_packs_epi16', x, [16]u16**0}} (msk&255) | (msk>>8) } -def homAny{x:T & w256i{T}} = ~emit{u1, '_mm256_testz_si256', v2i{x}, v2i{x}} -def homAll{x:T & w256i{T}} = homMask{[32]u8 ~~ x} == 0xffff_ffff -def topAny{x:T & w256i{T}} = topMask{x} != 0 -def topAll{x:T & w256i{T}} = topMask{x} == (1<<vcount{T})-1 -def homMask{a:T, b:T & w256i{T,16}} = homMask{shuf{[4]u64, packQ{ty_s{a},ty_s{b}}, 4b3120}} +def homAny{x:T if w256i{T}} = ~emit{u1, '_mm256_testz_si256', v2i{x}, v2i{x}} +def homAll{x:T if w256i{T}} = homMask{[32]u8 ~~ x} == 0xffff_ffff +def topAny{x:T if w256i{T}} = topMask{x} != 0 +def topAll{x:T if w256i{T}} = topMask{x} == (1<<vcount{T})-1 +def homMask{a:T, b:T if w256i{T,16}} = homMask{shuf{[4]u64, packQ{ty_s{a},ty_s{b}}, 4b3120}} -def topAny{x:T & w256i{T,32}} = ~emit{u1, '_mm256_testz_ps', v2f{x}, v2f{x}} -def topAny{x:T & w256i{T,64}} = ~emit{u1, '_mm256_testz_pd', v2d{x}, v2d{x}} -def homAny{x:T & w256i{T} & elwidth{T}>=32} = topAny{x} +def topAny{x:T if w256i{T,32}} = ~emit{u1, '_mm256_testz_ps', v2f{x}, v2f{x}} +def topAny{x:T if w256i{T,64}} = ~emit{u1, '_mm256_testz_pd', v2d{x}, v2d{x}} +def homAny{x:T if w256i{T} and elwidth{T}>=32} = topAny{x} -def topAny{x:T & w256i{T,16}} = homAny{[16]i16~~x < [16]i16**0} -def topAll{x:T & w256i{T,16}} = homAll{[16]i16~~x < [16]i16**0} +def topAny{x:T if w256i{T,16}} = homAny{[16]i16~~x < [16]i16**0} +def topAll{x:T if w256i{T,16}} = homAll{[16]i16~~x < [16]i16**0} # conversion -def widen{T==[16]u16, x:X & X==[16]u8} = emit{T, '_mm256_cvtepu8_epi16', x}; def widen{T==[16]i16, x:X & X==[16]i8} = emit{T, '_mm256_cvtepi8_epi16', x} -def widen{T==[ 8]u32, x:X & X==[16]u8} = emit{T, '_mm256_cvtepu8_epi32', x}; def widen{T==[ 8]i32, x:X & X==[16]i8} = emit{T, '_mm256_cvtepi8_epi32', x} -def widen{T==[ 8]u32, x:X & X==[8]u16} = emit{T, '_mm256_cvtepu16_epi32', x}; def widen{T==[ 8]i32, x:X & X==[8]i16} = emit{T, '_mm256_cvtepi16_epi32', x} -def widen{T==[ 4]u64, x:X & X==[16]u8} = emit{T, '_mm256_cvtepu8_epi64', x}; def widen{T==[ 4]i64, x:X & X==[16]i8} = emit{T, '_mm256_cvtepi8_epi64', x} -def widen{T==[ 4]u64, x:X & X==[8]u16} = emit{T, '_mm256_cvtepu16_epi64', x}; def widen{T==[ 4]i64, x:X & X==[8]i16} = emit{T, '_mm256_cvtepi16_epi64', x} -def widen{T==[ 4]u64, x:X & X==[4]u32} = emit{T, '_mm256_cvtepu32_epi64', x}; def widen{T==[ 4]i64, x:X & X==[4]i32} = emit{T, '_mm256_cvtepi32_epi64', x} - -def narrow{T, x:X & w256i{X,32} & width{T}==8} = { +def widen{T==[16]u16, x:X if X==[16]u8} = emit{T, '_mm256_cvtepu8_epi16', x}; def widen{T==[16]i16, x:X if X==[16]i8} = emit{T, '_mm256_cvtepi8_epi16', x} +def widen{T==[ 8]u32, x:X if X==[16]u8} = emit{T, '_mm256_cvtepu8_epi32', x}; def widen{T==[ 8]i32, x:X if X==[16]i8} = emit{T, '_mm256_cvtepi8_epi32', x} +def widen{T==[ 8]u32, x:X if X==[8]u16} = emit{T, '_mm256_cvtepu16_epi32', x}; def widen{T==[ 8]i32, x:X if X==[8]i16} = emit{T, '_mm256_cvtepi16_epi32', x} +def widen{T==[ 4]u64, x:X if X==[16]u8} = emit{T, '_mm256_cvtepu8_epi64', x}; def widen{T==[ 4]i64, x:X if X==[16]i8} = emit{T, '_mm256_cvtepi8_epi64', x} +def widen{T==[ 4]u64, x:X if X==[8]u16} = emit{T, '_mm256_cvtepu16_epi64', x}; def widen{T==[ 4]i64, x:X if X==[8]i16} = emit{T, '_mm256_cvtepi16_epi64', x} +def widen{T==[ 4]u64, x:X if X==[4]u32} = emit{T, '_mm256_cvtepu32_epi64', x}; def widen{T==[ 4]i64, x:X if X==[4]i32} = emit{T, '_mm256_cvtepi32_epi64', x} + +def narrow{T, x:X if w256i{X,32} and width{T}==8} = { a:= packQ{x, x} b:= packQ{a, a} re_el{T, sel{[8]u32, b, make{[8]i32, 0,4,0,4,0,4,0,4}}} } -def narrow{T, x:X & w256i{X,32} & width{T}==16} = re_el{T, shuf{[4]u64, packQ{x, x}, 4b3120}} -def narrow{T, x:X & w256i{X,16} & width{T}== 8} = re_el{T, shuf{[4]u64, packQ{x, x}, 4b3120}} +def narrow{T, x:X if w256i{X,32} and width{T}==16} = re_el{T, shuf{[4]u64, packQ{x, x}, 4b3120}} +def narrow{T, x:X if w256i{X,16} and width{T}== 8} = re_el{T, shuf{[4]u64, packQ{x, x}, 4b3120}} -def narrow{T, x:X & w256f{X,64} & T<i32} = narrow{T, narrow{i32, x}} -def narrow{T, x:X & w256f{X,64} & T==i32} = emit{[4]i32, '_mm256_cvtpd_epi32', x} +def narrow{T, x:X if w256f{X,64} and T<i32} = narrow{T, narrow{i32, x}} +def narrow{T, x:X if w256f{X,64} and T==i32} = emit{[4]i32, '_mm256_cvtpd_epi32', x} -def narrow{T, x:X & w256u{X,64} & T==u32} = re_el{T, sel{[8]i32, x, make{[8]i32, 2*iota{8}}}} -def narrow{T, x:X & w256u{X,64} & T==u16} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, (iota{32}>>1<<2) | (iota{32}&1)}}} -def narrow{T, x:X & w256u{X,64} & T== u8} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, 4*iota{32}}}} +def narrow{T, x:X if w256u{X,64} and T==u32} = re_el{T, sel{[8]i32, x, make{[8]i32, 2*iota{8}}}} +def narrow{T, x:X if w256u{X,64} and T==u16} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, (iota{32}>>1<<2) | (iota{32}&1)}}} +def narrow{T, x:X if w256u{X,64} and T== u8} = re_el{T, sel{[16]i8, narrow{u32,x}, make{[32]i8, 4*iota{32}}}} -def cvt2{T, x:X & T==i32 & X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x} -def cvt2{T, x:X & T==f64 & X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x} +def cvt2{T, x:X if T==i32 and X==[4]f64} = emit{[4]i32, '_mm256_cvtpd_epi32', x} +def cvt2{T, x:X if T==f64 and X==[4]i32} = emit{[4]f64, '_mm256_cvtepi32_pd', x} diff --git a/src/singeli/src/avx512.singeli b/src/singeli/src/avx512.singeli index 9f7936fb..737ff41c 100644 --- a/src/singeli/src/avx512.singeli +++ b/src/singeli/src/avx512.singeli @@ -4,30 +4,30 @@ local { if (isfloat{T}) (if (width{T}==32) 'ps' else 'pd') else merge{'epi', fmtnat{width{T}}} } - def suf{V & isvec{V}} = suf{eltype{V}} + def suf{V if isvec{V}} = suf{eltype{V}} def pref{w} = merge{'_mm', if (w==128) '' else fmtnat{w}, '_'} - def pref{V & isvec{V}} = pref{width{V}} + def pref{V if isvec{V}} = pref{width{V}} } local def re_mask{M, sub} = { def l = vcount{M}; def w = max{32,l} sub{fmtnat{l}, fmtnat{w}, ty_u{w}} } -def reinterpret{M, a:T & ismask{M} & width{T}==width{M}} = { +def reinterpret{M, a:T if ismask{M} and width{T}==width{M}} = { re_mask{M, {l,w,W} => emit{M, merge{'_cvtu',w,'_mask',l}, promote{W, a}}} } -def reinterpret{T, a:M & ismask{M} & width{T}==width{M}} = { +def reinterpret{T, a:M if ismask{M} and width{T}==width{M}} = { re_mask{M, {l,w,W} => cast_i{T, emit{W, merge{'_cvtmask',l,'_u',w}, a}}} } -def maskStore{p:*V, m:M, v:V & ismask{M} & isvec{V} & vcount{M}==vcount{V}} = { +def maskStore{p:*V, m:M, v:V if ismask{M} and isvec{V} and vcount{M}==vcount{V}} = { emit{void, merge{pref{V}, 'mask_storeu_', suf{V}}, p, m, v} } def topMaskReg{x:V} = emit{[vcount{V}]u1, merge{pref{V},'mov',suf{V},'_mask'}, x} -def topMask{x:T & isvec{T} & 512==width{T}} = ty_u{vcount{T}}~~topMaskReg{x} -def homMask{x:T & isvec{T} & 512==width{T}} = topMask{x} +def topMask{x:T if isvec{T} and 512==width{T}} = ty_u{vcount{T}}~~topMaskReg{x} +def homMask{x:T if isvec{T} and 512==width{T}} = topMask{x} -def maskToHom{T, x:M & ismask{M} & isvec{T} & vcount{M}==vcount{T}} = { +def maskToHom{T, x:M if ismask{M} and isvec{T} and vcount{M}==vcount{T}} = { emit{T, merge{pref{T},'movm_',suf{T}}, x} } diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli index af151f78..b8fe0710 100644 --- a/src/singeli/src/base.singeli +++ b/src/singeli/src/base.singeli @@ -18,38 +18,38 @@ def istup = ktup def isunsigned{T} = isint{T} & ~issigned{T} -def isvec {T} = 0; def isvec {T & istype{T}} = same{typekind{T},'vector'} -def isprim{T} = 0; def isprim{T & istype{T}} = same{typekind{T},'primitive'} -def isptr {T} = 0; def isptr {T & istype{T}} = same{typekind{T},'pointer'} +def isvec {T} = 0; def isvec {T if istype{T}} = same{typekind{T},'vector'} +def isprim{T} = 0; def isprim{T if istype{T}} = same{typekind{T},'primitive'} +def isptr {T} = 0; def isptr {T if istype{T}} = same{typekind{T},'pointer'} def elwidth{T} = width{eltype{T}} oper &~ andnot infix none 35 -def andnot{a, b & anyNum{a} & anyNum{b}} = a & ~b +def andnot{a, b if anyNum{a} and anyNum{b}} = a & ~b -def load{p:P, n & isvec{eltype{P}}} = assert{0} -def store{p:P, n, v & isvec{eltype{P}}} = assert{0} -def load{p:P & isptr{P}} = load{p, 0} -# def store{p:P, v & isptr{P}} = store{p, 0, v} -def loadu{p:T & isunsigned{eltype{T}}} = emit{eltype{T}, merge{'loadu_u',fmtnat{elwidth{T}}}, p} -def storeu{p:T, v:(eltype{T}) & isunsigned{eltype{T}}} = emit{void, merge{'storeu_u',fmtnat{elwidth{T}}}, p, v} -def loadu{p:T & issigned{eltype{T}}} = loadu {*ty_u{eltype{T}} ~~ p} -def storeu{p:T, v:(eltype{T}) & issigned{eltype{T}}} = storeu{*ty_u{eltype{T}} ~~ p, ty_u{v}} -def loadu{p:T & elwidth{T}==8} = load{p} -def storeu{p:T, v:(eltype{T}) & elwidth{T}==8} = store{p, v} +def load{p:P, n if isvec{eltype{P}}} = assert{0} +def store{p:P, n, v if isvec{eltype{P}}} = assert{0} +def load{p:P if isptr{P}} = load{p, 0} +# def store{p:P, v if isptr{P}} = store{p, 0, v} +def loadu{p:T if isunsigned{eltype{T}}} = emit{eltype{T}, merge{'loadu_u',fmtnat{elwidth{T}}}, p} +def storeu{p:T, v:(eltype{T}) if isunsigned{eltype{T}}} = emit{void, merge{'storeu_u',fmtnat{elwidth{T}}}, p, v} +def loadu{p:T if issigned{eltype{T}}} = loadu {*ty_u{eltype{T}} ~~ p} +def storeu{p:T, v:(eltype{T}) if issigned{eltype{T}}} = storeu{*ty_u{eltype{T}} ~~ p, ty_u{v}} +def loadu{p:T if elwidth{T}==8} = load{p} +def storeu{p:T, v:(eltype{T}) if elwidth{T}==8} = store{p, v} -def reinterpret{T, x:X & T==X} = x +def reinterpret{T, x:X if T==X} = x def exportN{f, ...ns} = each{export{.,f}, ns} def exportT{name, fs} = { v:*type{select{fs,0}} = fs; export{name, v} } # hints -def rare{x & knum{x}} = x +def rare{x if knum{x}} = x def rare{x:(u1)} = emit{u1, '__builtin_expect', x, 0} -def assert{x & x==0} = assert{'failed assertion'} -def assert{x & x==1} = 1 +def assert{x if x==0} = assert{'failed assertion'} +def assert{x if x==1} = 1 def unreachable{} = emit{void, 'si_unreachable'} def assert{x:(u1)} = { if (not x) emit{void, 'si_unreachable'} } @@ -65,24 +65,24 @@ def anyNum{x} = isconst{x} | knum{x} def anyNum{x:T} = isprim{T} def anyInt{x} = 0 -def anyInt{x & knum{x}} = (x>>0) == x -def anyInt{x & isreg{x}|isconst{x}} = isint{x} +def anyInt{x if knum{x}} = (x>>0) == x +def anyInt{x if isreg{x}|isconst{x}} = isint{x} # vector width/type checks -def w64 {T} = 0; def w64 {T & isvec{T}} = width{T}==64 -def w128{T} = 0; def w128{T & isvec{T}} = width{T}==128 -def w256{T} = 0; def w256{T & isvec{T}} = width{T}==256 -def w64 {T,w} = 0; def w64 {T,w & w64{T}} = elwidth{T}==w -def w128{T,w} = 0; def w128{T,w & w128{T}} = elwidth{T}==w -def w256{T,w} = 0; def w256{T,w & w256{T}} = elwidth{T}==w +def w64 {T} = 0; def w64 {T if isvec{T}} = width{T}==64 +def w128{T} = 0; def w128{T if isvec{T}} = width{T}==128 +def w256{T} = 0; def w256{T if isvec{T}} = width{T}==256 +def w64 {T,w} = 0; def w64 {T,w if w64{T}} = elwidth{T}==w +def w128{T,w} = 0; def w128{T,w if w128{T}} = elwidth{T}==w +def w256{T,w} = 0; def w256{T,w if w256{T}} = elwidth{T}==w # width+type checks def genchk{B, F} = { def r{T} = 0 - def r{T & B{T}} = F{eltype{T}} + def r{T if B{T}} = F{eltype{T}} def r{T,w} = 0 - def r{T,w & B{T}} = F{eltype{T}} & (elwidth{T}==w) - def r{T & ~isvec{T}} = 0 + def r{T,w if B{T}} = F{eltype{T}} & (elwidth{T}==w) + def r{T if ~isvec{T}} = 0 r } def w256i = genchk{w256, isint}; def w128i = genchk{w128, isint}; def w64i = genchk{w64, isint} @@ -92,14 +92,14 @@ def w256f = genchk{w256, isfloat}; def w128f = genchk{w128, isfloat}; de -def trunc{T, x:U & isint{T} & isint{U} & T<=U} = emit{T, '', x} -def trunc{T, x & knum{x}} = cast{T, x} +def trunc{T, x:U if isint{T} and isint{U} and T<=U} = emit{T, '', x} +def trunc{T, x if knum{x}} = cast{T, x} -def tern{c, T, F & anyInt{c}} = { +def tern{c, T, F if anyInt{c}} = { if(c) T else F } -def tern{c, t:T, f:T & anyInt{c}} = { +def tern{c, t:T, f:T if anyInt{c}} = { res:T = f if (c) res = t res @@ -111,42 +111,42 @@ def re_el{E, V} = [width{V}/width{E}]E def re_el{E, x:V} = re_el{E,V} ~~ x local def qualChange{q} = { - def f{w & knum{w}} = primtype{q, w} - def f{T & isprim{T}} = primtype{q, width{T}} - def f{T & isvec{T}} = re_el{f{eltype{T}}, T} + def f{w if knum{w}} = primtype{q, w} + def f{T if isprim{T}} = primtype{q, width{T}} + def f{T if isvec{T}} = re_el{f{eltype{T}}, T} def f{x:T} = f{T}~~x } def ty_u = qualChange{'u'} def ty_s = qualChange{'i'} def ty_f = qualChange{'f'} -def w_n{T, w & isprim{T}} = primtype{quality{T}, w} -def w_d{T & isprim{T}} = to_w{T, width{T}*2} # double/halve primitive type width -def w_h{T & isprim{T}} = to_w{T, width{T}/2} +def w_n{T, w if isprim{T}} = primtype{quality{T}, w} +def w_d{T if isprim{T}} = to_w{T, width{T}*2} # double/halve primitive type width +def w_h{T if isprim{T}} = to_w{T, width{T}/2} -def n_d{T & isvec{T}} = [vcount{T}*2](eltype{T}) # double/halve vector count -def n_h{T & isvec{T}} = [vcount{T}/2](eltype{T}) +def n_d{T if isvec{T}} = [vcount{T}*2](eltype{T}) # double/halve vector count +def n_h{T if isvec{T}} = [vcount{T}/2](eltype{T}) -def el_d{T & isvec{T}} = [vcount{T}](w_d{eltype{T}}) # double/halve element width, preserving count -def el_h{T & isvec{T}} = [vcount{T}](w_h{eltype{T}}) +def el_d{T if isvec{T}} = [vcount{T}](w_d{eltype{T}}) # double/halve element width, preserving count +def el_h{T if isvec{T}} = [vcount{T}](w_h{eltype{T}}) -def el_m{T & isvec{T}} = re_el{w_d{eltype{T}}, T}; def el_m{x:T} = re_el{T}~~x # double/halve element width, preserving width -def el_s{T & isvec{T}} = re_el{w_h{eltype{T}}, T}; def el_s{x:T} = re_el{T}~~x +def el_m{T if isvec{T}} = re_el{w_d{eltype{T}}, T}; def el_m{x:T} = re_el{T}~~x # double/halve element width, preserving width +def el_s{T if isvec{T}} = re_el{w_h{eltype{T}}, T}; def el_s{x:T} = re_el{T}~~x # type stats -def minvalue{T & isunsigned{T}} = 0 -def maxvalue{T & isunsigned{T}} = (1<<width{T})-1 -def minvalue{T & issigned{T}} = - (1<<(width{T}-1)) -def maxvalue{T & issigned{T}} = (1<<(width{T}-1))-1 +def minvalue{T if isunsigned{T}} = 0 +def maxvalue{T if isunsigned{T}} = (1<<width{T})-1 +def minvalue{T if issigned{T}} = - (1<<(width{T}-1)) +def maxvalue{T if issigned{T}} = (1<<(width{T}-1))-1 # tuple operations -def iota{n & knum{n}} = range{n} -def broadcast{T, v & isprim{T}} = v -def broadcast{n, v & knum{n}} = each{{_}=>v, range{n}} -def collect{vars,begin,end,iter & knum{begin} & knum{end}} = { +def iota{n if knum{n}} = range{n} +def broadcast{T, v if isprim{T}} = v +def broadcast{n, v if knum{n}} = each{{_}=>v, range{n}} +def collect{vars,begin,end,iter if knum{begin} and knum{end}} = { each{iter{., vars}, range{end-begin}+begin} } @@ -162,7 +162,7 @@ def fast_BMI2{} = if (SLOW_PDEP) 0 else hasarch{'BMI2'} # test if vector has a specific width & element type def lvec{T, n, w} = 0 -def lvec{T, n, w & isvec{T} & vcount{T}==n & elwidth{T}==w} = 1 +def lvec{T, n, w if isvec{T} and vcount{T}==n and elwidth{T}==w} = 1 # base cases def { @@ -176,7 +176,7 @@ def { def homMaskX{a:T} = tup{1, homMask{a}} # tup{n,mask}; mask with each bit repeated n times def ctzX{{n,v}} = ctz{v}/n # ctz for a result of homMaskX -def homMask{...vs & length{vs}>1} = { +def homMask{...vs if length{vs}>1} = { def n = length{vs} def T = oneType{vs} def RT = ty_u{max{8,vcount{T}*n}} @@ -204,15 +204,15 @@ def mzipHi{a:T, b:T} = el_m{T} ~~ zipHi{a, b} def packQ{{a, b}} = packQ{a, b} def pair{{a, b}} = pair{a, b} -def widen{T, x:X & T==X} = x -def narrow{T, x:X & T==eltype{X}} = x -def undefPromote{T, x:X & T==X} = x -def cvt{T, x:X & T==eltype{X}} = x +def widen{T, x:X if T==X} = x +def narrow{T, x:X if T==eltype{X}} = x +def undefPromote{T, x:X if T==X} = x +def cvt{T, x:X if T==eltype{X}} = x -def broadcast{T, v & isvec{T}} = vec_broadcast{T, promote{eltype{T},v}} -def make{T, ...xs & isvec{T}} = vec_make{T, ...xs} -def iota{T & isvec{T}} = make{T, ...iota{vcount{T}}} -def absu{a:T & isvec{T}} = ty_u{abs{a}} +def broadcast{T, v if isvec{T}} = vec_broadcast{T, promote{eltype{T},v}} +def make{T, ...xs if isvec{T}} = vec_make{T, ...xs} +def iota{T if isvec{T}} = make{T, ...iota{vcount{T}}} +def absu{a:T if isvec{T}} = ty_u{abs{a}} def floor = __floor def ceil = __ceil @@ -226,29 +226,29 @@ def sqrt = __sqrt # more arith -def min{a, b & anyNum{a} & anyNum{b}} = tern{a<b, a, b} -def max{a, b & anyNum{a} & anyNum{b}} = tern{a>b, a, b} +def min{a, b if anyNum{a} and anyNum{b}} = tern{a<b, a, b} +def max{a, b if anyNum{a} and anyNum{b}} = tern{a>b, a, b} def cdiv{a,b} = (a+b-1)/b # ceiling divide -def cdiv{a,b & knum{a} & knum{b}} = ceil{a/b} -def popc{x:T & isint{T} & width{T}==64} = emit{ux, '__builtin_popcountll', x} -def popc{x:T & isint{T} & width{T}<=32} = emit{ux, '__builtin_popcount', x} -def ctz{x:T & isint{T} & width{T}==64} = emit{ux, '__builtin_ctzll', x} -def ctz{x:T & isint{T} & width{T}<=32} = emit{ux, '__builtin_ctz', x} -def clz{x:T & isint{T} & width{T}==64} = emit{ux, '__builtin_clzll', x} -def clz{x:T & isint{T} & width{T}<=32} = emit{ux, '__builtin_clz', x} +def cdiv{a,b if knum{a} and knum{b}} = ceil{a/b} +def popc{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_popcountll', x} +def popc{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_popcount', x} +def ctz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_ctzll', x} +def ctz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_ctz', x} +def clz{x:T if isint{T} and width{T}==64} = emit{ux, '__builtin_clzll', x} +def clz{x:T if isint{T} and width{T}<=32} = emit{ux, '__builtin_clz', x} # count-leading-zeros complement, less type-dependent -def clzc{x:T & isint{T} & width{T}==64} = 64-clz{x} -def clzc{x:T & isint{T} & width{T}<=32} = 32-clz{x} +def clzc{x:T if isint{T} and width{T}==64} = 64-clz{x} +def clzc{x:T if isint{T} and width{T}<=32} = 32-clz{x} def ceil_log2{n} = clzc{n-1} -def truncBits{n, v & n<=8} = cast_i{u8, v} -def truncBits{n, v & n==16} = cast_i{u16, v} -def truncBits{n, v & n==32} = cast_i{u32, v} -def truncBits{n, v & n==64} = cast_i{u64, v} +def truncBits{n, v if n<=8} = cast_i{u8, v} +def truncBits{n, v if n==16} = cast_i{u16, v} +def truncBits{n, v if n==32} = cast_i{u32, v} +def truncBits{n, v if n==64} = cast_i{u64, v} # base-2 log of a constant power of two -def lb{n & knum{n} & (n>>1<<1) == n & n>0} = lb{n>>1}+1 +def lb{n if knum{n} and (n>>1<<1) == n and n>0} = lb{n>>1}+1 def lb{n==1} = 0 def zlow{n,x} = (x >> n) << n # zero out n least significant bits @@ -256,11 +256,11 @@ def tail{n,x} = x & ((1<<n) - 1) # get the n least significant bits def bit {k,x} = x & (1<<k) # get the k-th bit # range logic -def inRangeLen{x:TS, start, count & issigned{eltype{TS}}} = { # ∊ [start;start+count) +def inRangeLen{x:TS, start, count if issigned{eltype{TS}}} = { # ∊ [start;start+count) def TU = ty_u{TS} (TU~~(x-TS**start)) < TU**count } -def inRangeLen{x:TU, start, count & isunsigned{eltype{TU}}} = { # ∊ [start;start+count) +def inRangeLen{x:TU, start, count if isunsigned{eltype{TU}}} = { # ∊ [start;start+count) def TS = ty_s{TU} def h = 1 << (elwidth{TU}-1) (TS~~(x-TU**(start-h))) < TS**(count-h) @@ -270,8 +270,8 @@ def inRangeExcl{x:T, start, end} = inRangeLen{x, start, end-start} # ∊ [start; -def load{p,i & kgen{p}} = p{i} -def store{p,i,x & kgen{p}} = p{i,x} +def load{p,i if kgen{p}} = p{i} +def store{p,i,x if kgen{p}} = p{i,x} def tptr{l,s} = { # create "pointer" generator with given load & store definitions def r{i} = l{i} def r{i,x} = s{i,x} @@ -329,7 +329,7 @@ def eachx{F, ...args} = { each{F, ...each{{x} => if (istup{x}) x else l**x, args}} } -def undef{T, n & istype{T}} = @collect(n) undef{T} -def undef{Ts & istup{Ts}} = each{undef, Ts} +def undef{T, n if istype{T}} = @collect(n) undef{T} +def undef{Ts if istup{Ts}} = each{undef, Ts} def undef{x:T} = undef{T} -def undef{T & istype{T}} = { reg:=undefined{T} } +def undef{T if istype{T}} = { reg:=undefined{T} } diff --git a/src/singeli/src/bins.singeli b/src/singeli/src/bins.singeli index 08459bce..128ab8bb 100644 --- a/src/singeli/src/bins.singeli +++ b/src/singeli/src/bins.singeli @@ -41,20 +41,20 @@ fn max_scan{T, up}(x:*T, len:u64) : void = { def getsel{...x} = assert{'shuffling not supported', show{...x}} if_inline (hasarch{'AVX2'}) { - def getsel{h:H & lvec{H, 16, 8}} = { + def getsel{h:H if lvec{H, 16, 8}} = { sel{H, pair{h,h}, .} } - def getsel{v:V & lvec{V, 32, 8}} = { + def getsel{v:V if lvec{V, 32, 8}} = { def H = n_h{V} vtop := V**(vcount{V}/2) hs := each{shuf{[4]u64, v, .}, tup{4b3232, 4b1010}} {i} => homBlend{...each{sel{H,.,i}, hs}, V~~i<vtop} } - def getsel{v:V & lvec{V, 8, 32}} = sel{V, v, .} + def getsel{v:V if lvec{V, 8, 32}} = sel{V, v, .} } # Move evens to half 0 and odds to half 1 -def uninterleave{x:V & hasarch{'AVX2'}} = { +def uninterleave{x:V if hasarch{'AVX2'}} = { def vl = vcount{V}; def bytes = width{eltype{V}}/8 def i = 2*iota{vl/4} def i2= join{table{+, bytes*merge{i,i+1}, iota{bytes}}} @@ -108,7 +108,7 @@ fn write_indices{I,T}(t:*I, w:*T, n:u64) : void = { } setlabel{break} } -fn write_indices{I,T & width{I}==8}(t:*I, w:*T, n:u64) : void = { +fn write_indices{I,T if width{I}==8}(t:*I, w:*T, n:u64) : void = { @for (w over j to n) store{t, w, cast_i{I, j+1}} } def bins_lookup{I, T, up, w:*T, wn:(u64), x:*T, xn:(u64), rp:(*void)} = { @@ -129,7 +129,7 @@ def bins_lookup{I, T, up, w:*T, wn:(u64), x:*T, xn:(u64), rp:(*void)} = { tfree{t0} } -def bins_vectab_i8{up, w, wn, x, xn, rp, t0, t, done & hasarch{'AVX2'}} = { +def bins_vectab_i8{up, w, wn, x, xn, rp, t0, t, done if hasarch{'AVX2'}} = { assert{wn < 128} # Total must fit in i8 def vl = 32 def T = i8 @@ -200,7 +200,7 @@ def bins_vectab_i8{up, w, wn, x, xn, rp, t0, t, done & hasarch{'AVX2'}} = { } # Binary search within vector registers -def bin_search_vec{prim, T, w:*T, wn, x:*T, xn, rp, maxwn & hasarch{'AVX2'}} = { +def bin_search_vec{prim, T, w:*T, wn, x:*T, xn, rp, maxwn if hasarch{'AVX2'}} = { def up = prim != '⍒' def search = (prim=='∊') | (prim=='⊐') assert{wn > 1}; assert{wn < maxwn} diff --git a/src/singeli/src/bitops.singeli b/src/singeli/src/bitops.singeli index 5dcf9c62..6d56a11f 100644 --- a/src/singeli/src/bitops.singeli +++ b/src/singeli/src/bitops.singeli @@ -4,13 +4,13 @@ def b_get{x:(*u64), n:(ux)} = { ((load{x,n>>6}>>(n&63)) & 1) != 0 } -def b_getBatchLo{sz, x:(*u64), n:(ux) & sz==2} = (load{*u8~~x, n>>2} >> cast_i{u8, (n&3)*2}) -def b_getBatchLo{sz, x:(*u64), n:(ux) & sz==4} = (load{*u8~~x, n>>1} >> cast_i{u8, (n&1)*4}) -def b_getBatchLo{sz, x:(*u64), n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n} +def b_getBatchLo{sz, x:(*u64), n:(ux) if sz==2} = (load{*u8~~x, n>>2} >> cast_i{u8, (n&3)*2}) +def b_getBatchLo{sz, x:(*u64), n:(ux) if sz==4} = (load{*u8~~x, n>>1} >> cast_i{u8, (n&1)*4}) +def b_getBatchLo{sz, x:(*u64), n:(ux) if sz>=8} = load{*ty_u{sz}~~x, n} -def b_getBatch{sz, x:(*u64), n:(ux) & sz==2} = b_getBatchLo{sz, x, n} & 3 -def b_getBatch{sz, x:(*u64), n:(ux) & sz==4} = b_getBatchLo{sz, x, n} & 15 -def b_getBatch{sz, x:(*u64), n:(ux) & sz>=8} = load{*ty_u{sz}~~x, n} +def b_getBatch{sz, x:(*u64), n:(ux) if sz==2} = b_getBatchLo{sz, x, n} & 3 +def b_getBatch{sz, x:(*u64), n:(ux) if sz==4} = b_getBatchLo{sz, x, n} & 15 +def b_getBatch{sz, x:(*u64), n:(ux) if sz>=8} = load{*ty_u{sz}~~x, n} def b_set{x:(*u64), n:(ux), v:(u1)} = { @@ -30,7 +30,7 @@ def b_setBatch{sz, x:(*u64), n:(ux), v} = { store{x, n/am, w} } -def b_setBatch{sz, x:(*u64), n:(ux), v & sz==4} = { +def b_setBatch{sz, x:(*u64), n:(ux), v if sz==4} = { x8:= *u8 ~~ x #w:u64 = cast_i{u64, load{x8,n/2}} @@ -49,10 +49,10 @@ def b_setBatch{sz, x:(*u64), n:(ux), v & sz==4} = { store{x8, n/2, cast_i{u8,w}} } -def b_setBatch{sz, x:(*u64), n:(ux), v & sz== 8} = store{*u8 ~~ x, n, cast_i{u8, v}} -def b_setBatch{sz, x:(*u64), n:(ux), v & sz==16} = store{*u16 ~~ x, n, cast_i{u16,v}} -def b_setBatch{sz, x:(*u64), n:(ux), v & sz==32} = store{*u32 ~~ x, n, cast_i{u32,v}} -def b_setBatch{sz, x:(*u64), n:(ux), v & sz==64} = store{ x, n, cast_i{u64,v}} +def b_setBatch{sz, x:(*u64), n:(ux), v if sz== 8} = store{*u8 ~~ x, n, cast_i{u8, v}} +def b_setBatch{sz, x:(*u64), n:(ux), v if sz==16} = store{*u16 ~~ x, n, cast_i{u16,v}} +def b_setBatch{sz, x:(*u64), n:(ux), v if sz==32} = store{*u32 ~~ x, n, cast_i{u32,v}} +def b_setBatch{sz, x:(*u64), n:(ux), v if sz==64} = store{ x, n, cast_i{u64,v}} def spreadBits{T==[32]u8, a:(u32)} = { def idxs = iota{32} @@ -63,17 +63,17 @@ def spreadBits{T==[32]u8, a:(u32)} = { e == (d&e) } -def spreadBits{T==[16]u8, a:(u16) & hasarch{'AARCH64'}} = { +def spreadBits{T==[16]u8, a:(u16) if hasarch{'AARCH64'}} = { b:= sel{[16]u8, [16]u8~~[8]u16**a, make{[16]i8, iota{16}>=8}} andnz{b, make{[16]u8, 1<<(iota{16}&7)}} } -def spreadBits{T==[16]u8, a:(u16) & hasarch{'X86_64'}} = { +def spreadBits{T==[16]u8, a:(u16) if hasarch{'X86_64'}} = { b:= [16]u8~~[8]u16**a exp:= [16]u8~~shuf{[4]i32, shuf16Lo{mzipLo{b, b}, 4b1100}, 4b1100} (exp & make{[16]u8, 1<<(iota{16}&7)}) != [16]u8**0 } -def spreadBits{T, a & vcount{T} <= elwidth{T} & quality{eltype{T}}=='u'} = { +def spreadBits{T, a if vcount{T} <= elwidth{T} and quality{eltype{T}}=='u'} = { b:= make{T, 1<<iota{vcount{T}}} b == (b & T ~~ re_el{type{a}, T}**a) # not just T**a so that if a is read from RAM, it can use the single instruction for broadcasting from RAM; the extra bits don't matter } @@ -90,10 +90,10 @@ def loaduBit{x:(*u64), i, n} = { assert{(n<58) | (((n==58) | (n==60)) & (i%n == 0))} loaduBitRaw{x, i} } -def loaduBitTrunc{x:(*u64), i, n & knum{n}} = truncBits{n, loaduBit{x, i, n}} +def loaduBitTrunc{x:(*u64), i, n if knum{n}} = truncBits{n, loaduBit{x, i, n}} -def loadBatchBit{T, x:(*u64), is & ktup{is}} = { +def loadBatchBit{T, x:(*u64), is if ktup{is}} = { # def len = length{is} # def count = vcount{T} # assert{count*len <= 64} diff --git a/src/singeli/src/cbqnDefs.singeli b/src/singeli/src/cbqnDefs.singeli index 8aaf8026..cac96e91 100644 --- a/src/singeli/src/cbqnDefs.singeli +++ b/src/singeli/src/cbqnDefs.singeli @@ -1,13 +1,13 @@ def bcall{T, f, x} = emit{T, 'BCALL', f, x} -def from_B{T, x & T==f64} = bcall{T, 'o2fG', x} -def from_B{T, x & T<=i32 & issigned{T}} = bcall{T, 'o2iG', x} -def from_B{T, x & T<=u32 & isunsigned{T}} = bcall{T, 'o2cG', x} +def from_B{T, x if T==f64} = bcall{T, 'o2fG', x} +def from_B{T, x if T<=i32 and issigned{T}} = bcall{T, 'o2iG', x} +def from_B{T, x if T<=u32 and isunsigned{T}} = bcall{T, 'o2cG', x} def q_f64{x} = bcall{u1, 'q_f64', x} def q_chr{x} = bcall{u1, 'q_c32', x} -def q_chr{T,x & T==u8 } = bcall{u1, 'q_c8', x} -def q_chr{T,x & T==u16} = bcall{u1, 'q_c16', x} -def q_chr{T,x & T==u32} = bcall{u1, 'q_c32', x} +def q_chr{T,x if T==u8 } = bcall{u1, 'q_c8', x} +def q_chr{T,x if T==u16} = bcall{u1, 'q_c16', x} +def q_chr{T,x if T==u32} = bcall{u1, 'q_c32', x} def cbqn_c32Tag{} = emit{u64, '', 'C32_TAG'} def cbqn_tagTag{} = emit{u64, '', 'TAG_TAG'} @@ -21,18 +21,18 @@ def cbqn_nspTag{} = emit{u64, '', 'NSP_TAG'} def cbqn_objTag{} = emit{u64, '', 'OBJ_TAG'} def cbqn_arrTag{} = emit{u64, '', 'ARR_TAG'} -def cbqn_elType{T & T==u1 } = 0 -def cbqn_elType{T & T==i8 } = 1 -def cbqn_elType{T & T==i16} = 2 -def cbqn_elType{T & T==i32} = 3 -def cbqn_elType{T & T==f64} = 4 -def cbqn_elType{T & T==u8 } = 5 -def cbqn_elType{T & T==u16} = 6 -def cbqn_elType{T & T==u32} = 7 +def cbqn_elType{T if T==u1 } = 0 +def cbqn_elType{T if T==i8 } = 1 +def cbqn_elType{T if T==i16} = 2 +def cbqn_elType{T if T==i32} = 3 +def cbqn_elType{T if T==f64} = 4 +def cbqn_elType{T if T==u8 } = 5 +def cbqn_elType{T if T==u16} = 6 +def cbqn_elType{T if T==u32} = 7 def cbqn_tyArrOffset{} = emit{u64, 'offsetof', 'TyArr', 'a'} def talloc{T, len} = emit{*T, 'TALLOCP', fmt_type{T}, len} def tfree{ptr} = emit{void, 'TFREE', ptr} def fmt_type{T} = merge{quality{T}, fmtnat{width{T}}} -def fmt_type{T & isptr{T}} = merge{'*',fmt_type{eltype{T}}} +def fmt_type{T if isptr{T}} = merge{'*',fmt_type{eltype{T}}} diff --git a/src/singeli/src/clmul.singeli b/src/singeli/src/clmul.singeli index ea5f9123..c87bf1db 100644 --- a/src/singeli/src/clmul.singeli +++ b/src/singeli/src/clmul.singeli @@ -1,2 +1,2 @@ -def clmul{a:T, b:T, imm & w128i{T}} = emit{T, '_mm_clmulepi64_si128', a, b, imm} +def clmul{a:T, b:T, imm if w128i{T}} = emit{T, '_mm_clmulepi64_si128', a, b, imm} def clmul{a, b} = clmul{a, b, 0} diff --git a/src/singeli/src/cmp.singeli b/src/singeli/src/cmp.singeli index e47db6ca..04d673d6 100644 --- a/src/singeli/src/cmp.singeli +++ b/src/singeli/src/cmp.singeli @@ -16,13 +16,13 @@ fn cmpIX(dst:(*u64), len:ux, x:(u64), v:(u1)) : void = { def eqne{op} = same{op,__eq}|same{op,__ne} -def pathAS{dst, len, T, op, x & issigned{T}} = { - def R{f & eqne{op}} = { +def pathAS{dst, len, T, op, x if issigned{T}} = { + def R{f if eqne{op}} = { if (rare{floor{f}!=f}) fillbits{dst, len, op{0,1}, x} # also includes check for NaN/sNaN ftrunc{i64,f} } - def R{f & same{op,__lt}|same{op,__ge}} = ftrunc{i64,ceil{f}} - def R{f & same{op,__gt}|same{op,__le}} = ftrunc{i64,floor{f}} + def R{f if same{op,__lt} or same{op,__ge}} = ftrunc{i64,ceil{f}} + def R{f if same{op,__gt} or same{op,__le}} = ftrunc{i64,floor{f}} xf:f64 = interp_f64{x} xi64:i64 = R{xf} @@ -40,7 +40,7 @@ def pathAS{dst, len, T, op, x & issigned{T}} = { xT } -def pathAS{dst, len, T, op, x & T==f64} = { +def pathAS{dst, len, T, op, x if T==f64} = { if (rare{~q_f64{x}}) { if (~eqne{op}) if (~q_chr{x}) cmp_err{x} fillbits{dst, len, op{0,1}, x} @@ -48,7 +48,7 @@ def pathAS{dst, len, T, op, x & T==f64} = { from_B{T,x} } -def pathAS{dst, len, T, op, x & isunsigned{T}} = { +def pathAS{dst, len, T, op, x if isunsigned{T}} = { if (rare{~q_chr{x}}) { if (~eqne{op}) if (~q_f64{x}) cmp_err{x} fillbits{dst, len, op{1,0}, x} @@ -66,12 +66,12 @@ def any2bit{VT, unr, op0, wS, wV, xS, xV, dst:(*u64), len:(ux)} = { def T = eltype{VT} def op = match (op0) { - {_ & ~hasarch{'X86_64'} | hasarch{'AVX512F'}} => op0 - {(__le) & issigned{T}} => __gt - {(__ge) & issigned{T}} => __lt - {(__lt) & isunsigned{T}} => __ge - {(__gt) & isunsigned{T}} => __le - {(__ne) & isint{T}} => __eq + {_ if not hasarch{'X86_64'} or hasarch{'AVX512F'}} => op0 + {(__le) if issigned{T}} => __gt + {(__ge) if issigned{T}} => __lt + {(__lt) if isunsigned{T}} => __ge + {(__gt) if isunsigned{T}} => __le + {(__ne) if isint{T}} => __eq {_} => op0 } def mask = if (same{op0, op}) homMask else ({...x} => ~homMask{...x}) diff --git a/src/singeli/src/count.singeli b/src/singeli/src/count.singeli index 1a8ed9ad..624de946 100644 --- a/src/singeli/src/count.singeli +++ b/src/singeli/src/count.singeli @@ -4,7 +4,7 @@ include './vecfold' if_inline (hasarch{'SSE2'}) { fn sum_vec{T}(v:T) = vfold{+, fold{+, unpackQ{v, T**0}}} - def fold_addw{v:T & eltype{T}==i8} = sum_vec{T}(v) + def fold_addw{v:T if eltype{T}==i8} = sum_vec{T}(v) } def inc{ptr, ind, v} = store{ptr, ind, v + load{ptr, ind}} diff --git a/src/singeli/src/dyarith.singeli b/src/singeli/src/dyarith.singeli index e61074b7..768b81c4 100644 --- a/src/singeli/src/dyarith.singeli +++ b/src/singeli/src/dyarith.singeli @@ -6,15 +6,15 @@ include './mask' include 'util/tup' -def rootty{T & isprim{T}} = T -def rootty{T & isvec{T}} = eltype{T} +def rootty{T if isprim{T}} = T +def rootty{T if isvec{T}} = eltype{T} def is_s{X} = issigned{rootty{X}} def is_u{X} = isunsigned{rootty{X}} def ty_sc{O, R} = R # keep floats as-is -def ty_sc{O, R & is_s{O} & is_u{R}} = ty_s{R} -def ty_sc{O, R & is_u{O} & is_s{R}} = ty_u{R} +def ty_sc{O, R if is_s{O} and is_u{R}} = ty_s{R} +def ty_sc{O, R if is_u{O} and is_s{R}} = ty_u{R} def bqn_or{a, b} = (a+b)-(a*b) @@ -22,18 +22,18 @@ def bqn_or{a, b} = (a+b)-(a*b) # + & - def arithChk1{F==__add, M, w:T, x:T, r:T} = tup{'topAny', M{(w^r) & (x^r)}} def arithChk1{F==__sub, M, w:T, x:T, r:T} = tup{'topAny', M{(w^x) & (w^r)}} -def arithChk1{F==__add, M, w:T, x:T, r:T & isvec{T} & tern{hasarch{'X86_64'}, elwidth{T}<=16, 1}} = tup{'anyne', adds{w,x}, r} -def arithChk1{F==__sub, M, w:T, x:T, r:T & isvec{T} & tern{hasarch{'X86_64'}, elwidth{T}<=16, 1}} = tup{'anyne', subs{w,x}, r} +def arithChk1{F==__add, M, w:T, x:T, r:T if isvec{T} and tern{hasarch{'X86_64'}, elwidth{T}<=16, 1}} = tup{'anyne', adds{w,x}, r} +def arithChk1{F==__sub, M, w:T, x:T, r:T if isvec{T} and tern{hasarch{'X86_64'}, elwidth{T}<=16, 1}} = tup{'anyne', subs{w,x}, r} -def arithChk2{F, M, w:T, x:T & is_s{T} & (same{F,__add} | same{F,__sub})} = { +def arithChk2{F, M, w:T, x:T if is_s{T} and (same{F,__add} or same{F,__sub})} = { r:= F{w,x} tup{r, arithChk1{F, M, w, x, r}} } # × -def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i8==eltype{T} & hasarch{'X86_64'}} = { +def arithChk2{F, M, w:T, x:T if same{F,__mul} and isvec{T} and i8==eltype{T} and hasarch{'X86_64'}} = { def wp = unpackQ{w, T ~~ (T**0 > w)} def xp = unpackQ{x, T ~~ (T**0 > x)} def rp = each{__mul, wp, xp} @@ -46,12 +46,12 @@ def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i8==eltype{T} & hasarc tup{packQ{rp}, tup{'~andAllZero', RU~~tree_fold{|, bad}, RU**0xff80}} } } -def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i16==eltype{T} & hasarch{'X86_64'}} = { +def arithChk2{F, M, w:T, x:T if same{F,__mul} and isvec{T} and i16==eltype{T} and hasarch{'X86_64'}} = { rl:= __mul{w,x} rh:= mulh{w,x} tup{rl, tup{'anyne', rh, rl>>15}} } -def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i32==eltype{T} & hasarch{'X86_64'}} = { +def arithChk2{F, M, w:T, x:T if same{F,__mul} and isvec{T} and i32==eltype{T} and hasarch{'X86_64'}} = { max:= re_el{f32, (ty_u{T})**0x4efffffe} def cf32{x:X} = emit{re_el{f32,X}, tern{T==[8]i32, '_mm256_cvtepi32_ps', '_mm_cvtepi32_ps'}, x} f32mul:= cf32{w} * cf32{x} @@ -68,7 +68,7 @@ def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & i32==eltype{T} & hasar # tup{packQQ{each{{v} => v & T2**0xFFFFFFFF, rp}}, tup{'homAny', tree_fold{|,bad}}} this doesn't use M } -def arithChk2{F, M, w:T, x:T & same{F,__mul} & isvec{T} & hasarch{'AARCH64'}} = { +def arithChk2{F, M, w:T, x:T if same{F,__mul} and isvec{T} and hasarch{'AARCH64'}} = { def r12 = mulw{w, x} rl:= packLo{r12} rh:= packHi{r12} @@ -82,24 +82,24 @@ def runner{u, R, F} = { def run{F, M, w, x} = { show{'todo', c, R, F, w, x}; emit{void,'__builtin_abort'}; w } - def run{F, M, w:T, x:T & c & R!=u32} = { + def run{F, M, w:T, x:T if c and R!=u32} = { arithChk2{F, M, w, x} } - def run{F, M, w, x & u} = tup{F{w, x}, tup{'none'}} # trivial base implementation + def run{F, M, w, x if u} = tup{F{w, x}, tup{'none'}} # trivial base implementation def toggleTop{x:X} = x ^ X**(1<<(elwidth{X}-1)) - def run{F==__sub, M, w:VU, x:VU & c & is_u{VU}} = { # 'b'-'a' + def run{F==__sub, M, w:VU, x:VU if c and is_u{VU}} = { # 'b'-'a' def VS = ty_s{VU} run{F, M, VS~~toggleTop{w}, VS~~toggleTop{x}} } - def run{F, M, w:VU, x:VS & c & is_u{VU} & is_s{VS}} = { # 'a'+3, 'a'-3 + def run{F, M, w:VU, x:VS if c and is_u{VU} and is_s{VS}} = { # 'a'+3, 'a'-3 def {res, ok} = run{F, M, VS~~toggleTop{w}, x} tup{toggleTop{VU~~res}, ok} } - def run{F==__add, M, w:VS, x:VU & c & is_s{VS} & is_u{VU}} = run{F, M, x, w} # 3+'a' → 'a'+3 + def run{F==__add, M, w:VS, x:VU if c and is_s{VS} and is_u{VU}} = run{F, M, x, w} # 3+'a' → 'a'+3 - def run{F, M, w:VW, x:VX & c & R==u32 & (same{F,__add} | same{F,__sub})} = { # 'a'+1, 'a'-1 + def run{F, M, w:VW, x:VX if c and R==u32 and (same{F,__add} | same{F,__sub})} = { # 'a'+1, 'a'-1 r:= F{ty_u{w}, ty_u{x}} tup{re_el{R, VW}~~r, tup{'homAny', M{r > type{r}**1114111}}} } @@ -111,7 +111,7 @@ def runChecks_any{F, vals} = { F{tree_fold{|, each{select{.,1}, vals}}} } def runChecks{type=='homAny', vals, M} = runChecks_any{homAny, vals} def runChecks{type=='topAny', vals, M} = runChecks_any{topAny, vals} def runChecks{type=='none', vals, M} = 0 -def runChecks{type=='~andAllZero', vals, M & ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}} +def runChecks{type=='~andAllZero', vals, M if ~M{0}} = ~tree_fold{&, each{andAllZero, ...slice{flip{vals}, 1}}} def runChecks{type=='anyne', vals, M} = { def i{vals} = { def {_,xs,ys} = flip{vals} @@ -182,7 +182,7 @@ fn arithSAf{vw, mode, F, swap, W, X, R}(r:*void, w:u64, x:*void, len:u64) : u64 def run = runner{(R==f64) | (mode>=2), R, F} def getW{v} = trunc{W, v} - def getW{v & W==f64} = interp_f64{v} + def getW{v if W==f64} = interp_f64{v} cw:= ty_sc{W, TY}**getW{w} def unr = tern{mode>=2, 2, 1} # same as in arithAAimpl diff --git a/src/singeli/src/equal.singeli b/src/singeli/src/equal.singeli index ade621f3..b80eb02d 100644 --- a/src/singeli/src/equal.singeli +++ b/src/singeli/src/equal.singeli @@ -39,9 +39,9 @@ fn equal{W, X}(w:*void, x:*void, l:u64, d:u64) : u1 = { } else { # bitarr ≡ i8/i16/i32arr def T = [bulk]X def sh{c} = c << (width{X}-1) - def sh{c & X==u8} = T ~~ (re_el{u16,c}<<7) - def mask{x:X & hasarch{'X86_64'}} = topMask{x} - def mask{x:X & hasarch{'AARCH64'}} = homMask{andnz{x, ~T**0}} + def sh{c if X==u8} = T ~~ (re_el{u16,c}<<7) + def mask{x:X if hasarch{'X86_64'}} = topMask{x} + def mask{x:X if hasarch{'AARCH64'}} = homMask{andnz{x, ~T**0}} # TODO compare with doing the comparison in vector registers badBits:= T ** ~(X~~1) diff --git a/src/singeli/src/f64.singeli b/src/singeli/src/f64.singeli index 00dda936..62107337 100644 --- a/src/singeli/src/f64.singeli +++ b/src/singeli/src/f64.singeli @@ -6,10 +6,10 @@ def NaN = 0.0/0.0 def isNaN{x:(f64)} = x!=x def qNaN{x:(u64)} = (x<<1) == (cast{u64, 0x8ff8} << 49) -def ftrunc{T, x:(f64) & i8==T} = emit{i8, '', x} -def ftrunc{T, x:(f64) & i16==T} = emit{i16, '', x} -def ftrunc{T, x:(f64) & i32==T} = emit{i32, '', x} # maybe explicitly use _mm_cvtsd_si32? -def ftrunc{T, x:(f64) & i64==T} = emit{i64, '', x} +def ftrunc{T== i8, x:(f64)} = emit{T, '', x} +def ftrunc{T==i16, x:(f64)} = emit{T, '', x} +def ftrunc{T==i32, x:(f64)} = emit{T, '', x} # maybe explicitly use _mm_cvtsd_si32? +def ftrunc{T==i64, x:(f64)} = emit{T, '', x} def fext{x} = emit{f64, '', x} def interp_f64{x:(u64)} = emit{f64, 'interp_f64', x} diff --git a/src/singeli/src/fold.singeli b/src/singeli/src/fold.singeli index bb132d4e..1f262be5 100644 --- a/src/singeli/src/fold.singeli +++ b/src/singeli/src/fold.singeli @@ -3,8 +3,8 @@ include './mask' def opsh64{op}{v:([4]f64), perm} = op{v, shuf{[4]u64, v, perm}} def opsh32{op}{v:([2]f64), perm} = op{v, shuf{[4]u32, v, perm}} -def mix{op, v:([4]f64) & hasarch{'AVX'}} = { def sh=opsh64{op}; sh{sh{v, 4b2301}, 4b1032} } -def mix{op, v:([2]f64) & hasarch{'X86_64'}} = opsh32{op}{v, 4b1032} +def mix{op, v:([4]f64) if hasarch{'AVX'}} = { def sh=opsh64{op}; sh{sh{v, 4b2301}, 4b1032} } +def mix{op, v:([2]f64) if hasarch{'X86_64'}} = opsh32{op}{v, 4b1032} def reduce_pairwise{op, plog, x:*T, len, init:T} = { # Pairwise combination to shorten dependency chains diff --git a/src/singeli/src/hashtab.singeli b/src/singeli/src/hashtab.singeli index 3a2556fd..502fc1a2 100644 --- a/src/singeli/src/hashtab.singeli +++ b/src/singeli/src/hashtab.singeli @@ -29,7 +29,7 @@ def hash_val{x0:(u64)} = { } # CRC32 if_inline (hasarch{'SSE4.2'}) require{'x86intrin.h'} -def hash_val{x:(u32) & hasarch{'SSE4.2'}} = { +def hash_val{x:(u32) if hasarch{'SSE4.2'}} = { emit{u32, '_mm_crc32_u32', 0x973afb51, x} } diff --git a/src/singeli/src/mask.singeli b/src/singeli/src/mask.singeli index 598bb439..11433d84 100644 --- a/src/singeli/src/mask.singeli +++ b/src/singeli/src/mask.singeli @@ -3,25 +3,25 @@ local def maskInit1{w} = { merge{(w/8-1)**255, (1<<x)-1, (w/8)**0} }, iota{8}}} } -mask256_1:*u8 = maskInit1{256}; def maskOfBit{T,n & width{T}==256} = load{*[32]u8 ~~ (mask256_1 + (n>>3)^31 + 64*(n&7))} -mask128_1:*u8 = maskInit1{128}; def maskOfBit{T,n & width{T}==128} = load{*[16]u8 ~~ (mask128_1 + (n>>3)^15 + 32*(n&7))} +mask256_1:*u8 = maskInit1{256}; def maskOfBit{T,n if width{T}==256} = load{*[32]u8 ~~ (mask256_1 + (n>>3)^31 + 64*(n&7))} +mask128_1:*u8 = maskInit1{128}; def maskOfBit{T,n if width{T}==128} = load{*[16]u8 ~~ (mask128_1 + (n>>3)^15 + 32*(n&7))} mask256:*i64 = merge{4 ** -1, 4 ** 0} local def maskOfImpl{T, n, w} = load{*ty_u{T} ~~ (*u8~~mask256 + 32 - n*(elwidth{T}/8))} # get homogeneous mask of first n items; 0 ≤ n ≤ vcount{T} -def maskOf{T,n & width{T}==256} = maskOfImpl{T, n, 256} -def maskOf{T,n & width{T}==128} = maskOfImpl{T, n, 128} -def maskOf{T,n & width{T}== 64} = maskOfImpl{T, n, 64} - -def anyne{x:T, y:T, M & M{0}==0 & isvec{T}} = ~homAll{x==y} -def anyne{x:T, y:T, M & M{0}==1 & isvec{T}} = homAny{M{x!=y}} -def anyne{x:T, y:T, M & M{0}==0 & anyInt{x}} = x!=y -def anyne{x:T, y:T, M & M{0}==1 & anyInt{x}} = M{x^y} != 0 +def maskOf{T,n if width{T}==256} = maskOfImpl{T, n, 256} +def maskOf{T,n if width{T}==128} = maskOfImpl{T, n, 128} +def maskOf{T,n if width{T}== 64} = maskOfImpl{T, n, 64} + +def anyne{x:T, y:T, M if M{0}==0 and isvec{T}} = ~homAll{x==y} +def anyne{x:T, y:T, M if M{0}==1 and isvec{T}} = homAny{M{x!=y}} +def anyne{x:T, y:T, M if M{0}==0 and anyInt{x}} = x!=y +def anyne{x:T, y:T, M if M{0}==1 and anyInt{x}} = M{x^y} != 0 def anyneBit{x:T, y:T, M} = ~M{x^y, 'all bits zeroes'} -def anynePositive{x:T, y:T, M & M{0}==0} = anyne{x, y, M} -def anynePositive{x:T, y:T, M & M{0}==1 & isvec{T}} = { +def anynePositive{x:T, y:T, M if M{0}==0} = anyne{x, y, M} +def anynePositive{x:T, y:T, M if M{0}==1 and isvec{T}} = { def {n,m} = homMaskX{x==y} def E = tern{type{m}==u64, u64, u32} (promote{E,~m} << (width{E}-M{'count'}*n)) != 0 @@ -30,8 +30,8 @@ def anynePositive{x:T, y:T, M & M{0}==1 & isvec{T}} = { def maskNone{x} = x def maskNone{x, mode=='all bits zeroes'} = andAllZero{x, x} def maskAfter{n} = { - def mask{x:X & isvec{X}} = x & (X~~maskOf{X,n}) - def mask{x:X & anyInt{x}} = x & ((1<<n) - 1) + def mask{x:X if isvec{X}} = x & (X~~maskOf{X,n}) + def mask{x:X if anyInt{x}} = x & ((1<<n) - 1) def mask{x:X, mode=='all bits zeroes'} = andAllZero{x, X~~maskOfBit{X,n}} def mask{X, mode=='to sign bits'} = maskOf{X,n} def mask{X, mode=='to homogeneous bits'} = maskOf{X,n} @@ -42,7 +42,7 @@ def maskAfter{n} = { -def loadLowBatch{T, ptr:P, w, n & eltype{P}==eltype{T}} = loadLow{*T ~~ (ptr + n*(w/elwidth{P})), w} +def loadLowBatch{T, ptr:P, w, n if eltype{P}==eltype{T}} = loadLow{*T ~~ (ptr + n*(w/elwidth{P})), w} # store vcount{T} items into the n'th batch of ptr elements, compressing the items if needed; masked by M def storeBatch{ptr:P, n, x:T, M} = { @@ -63,8 +63,8 @@ def loadBatch{ptr:P, n, T} = { widen{T, loadLow{*re_el{E0, T} ~~ rpos, vcount{T}*width{E0}}} } -def loadBatch {ptr:P, ns, T & istup{ns}} = each{{n } => loadBatch {ptr, n, T }, ns} -def storeBatch{ptr:P, ns, xs, M & istup{ns}} = each{{n,x} => storeBatch{ptr, n, x, M}, ns, xs} +def loadBatch {ptr:P, ns, T if istup{ns}} = each{{n } => loadBatch {ptr, n, T }, ns} +def storeBatch{ptr:P, ns, xs, M if istup{ns}} = each{{n,x} => storeBatch{ptr, n, x, M}, ns, xs} @@ -74,7 +74,7 @@ def hCast{T,p:*T} = p def hCast{T,p:(*void)} = *T~~p def mlExec{i, iter, vars0, bulk, M} = { - def vproc{p:P & isptr{P}} = p + def vproc{p:P if isptr{P}} = p def vproc{('m')} = tptr{{_}=>M, '!'} def vproc{{T,p:P}} = tptr{{i} => loadBatch{p, i, T}, {i,x} => storeBatch{p, i, x, M}} diff --git a/src/singeli/src/neon.singeli b/src/singeli/src/neon.singeli index dc773fc8..70933be5 100644 --- a/src/singeli/src/neon.singeli +++ b/src/singeli/src/neon.singeli @@ -1,157 +1,157 @@ def nvec{T} = 0 -def nvec{T & isvec{T}} = (width{T}==64) | (width{T}==128) +def nvec{T if isvec{T}} = (width{T}==64) | (width{T}==128) def nvec{T,w} = 0 -def nvec{T,w & nvec{T}} = elwidth{T}==w +def nvec{T,w if nvec{T}} = elwidth{T}==w def nveci = genchk{nvec, isint} def nvecs = genchk{nvec, issigned} def nvecu = genchk{nvec, isunsigned} def nvecf = genchk{nvec, isfloat} -def reinterpret{T, v & same{'pointer',typekind{T}} & ktup{v}} = { tmp:T=v } +def reinterpret{T, v if same{'pointer',typekind{T}} and ktup{v}} = { tmp:T=v } def nty{T} = { def q = quality{T} merge{if (q=='i') 's' else q, fmtnat{width{T}}} } -def nty{T & isvec{T}} = nty{eltype{T}} -def ntyp{S, ...S2, T & w128{T}} = merge{S, 'q', ...S2, '_', nty{T}} -def ntyp{S, ...S2, T & w64{T}} = merge{S, ...S2, '_', nty{T}} +def nty{T if isvec{T}} = nty{eltype{T}} +def ntyp{S, ...S2, T if w128{T}} = merge{S, 'q', ...S2, '_', nty{T}} +def ntyp{S, ...S2, T if w64{T}} = merge{S, ...S2, '_', nty{T}} def ntyp0{S, T} = merge{S, '_', nty{T}} -def addwLo{a:T,b:T & w64i{T}} = emit{el_d{T}, ntyp{'vaddl', T}, a, b} -def subwLo{a:T,b:T & w64i{T}} = emit{el_d{T}, ntyp{'vsubl', T}, a, b} -def mulwLo{a:T,b:T & w64i{T}} = emit{el_d{T}, ntyp{'vmull', T}, a, b} -def mulwHi{a:T,b:T & w128i{T}} = emit{el_m{T}, ntyp0{'vmull_high', T}, a, b} -def mulw {a:T,b:T & w128{T}} = tup{mulwLo{half{a,0}, half{b,0}}, mulwHi{a,b}} - -def shrn{a:T, s & w128i{T} & elwidth{T}>8} = { def H=el_h{T}; emit{H, ntyp0{'vshrn_n', T}, a, s} } # a>>s, narrowed -def shrm{a:T, s, d:T & nvecu{T}} = emit{T, ntyp{'vsri', '_n', T}, d, a, s} # (a>>s) | (d & (mask of new zeroes)) -def shlm{a:T, s, d:T & nvecu{T}} = emit{T, ntyp{'vsli', '_n', T}, d, a, s} # (a<<s) | (d & (mask of new zeroes)) - -def bitBlend{f:T, t:T, m:M & nvec{T} & nvecu{M,elwidth{T}} & width{T}==width{M}} = emit{T, ntyp{'vbsl', T}, m, t, f} -def homBlend{f:T, t:T, m:M & nvec{M}} = bitBlend{f, t, m} - -def addpw { x:T & nveci{T} & elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening -def addpwa{a:D, x:T & nveci{T} & elwidth{T}<=32 & D==el_m{T}} = emit{D, ntyp{'vpadal', T}, a, x} # add pairwise widening + accumulate -def mla{a:T, x:T, y:T & nvec{T}} = emit{T, ntyp{'vmla', T}, a, x, y} # a + x*y -def mls{a:T, x:T, y:T & nvec{T}} = emit{T, ntyp{'vmls', T}, a, x, y} # a - x*y -def rbit{x:T & nvecu{T,8}} = emit{T, ntyp{'vrbit', T}, x} -def rev{w, x:T & w==elwidth{T}} = x -def rev{w==16, x:T & elwidth{T}<16} = emit{T, ntyp{'vrev16', T}, x} # reverse the order of elements in each w-bit window -def rev{w==32, x:T & elwidth{T}<32} = emit{T, ntyp{'vrev32', T}, x} -def rev{w==64, x:T & elwidth{T}<64} = emit{T, ntyp{'vrev64', T}, x} -def popc{x:T & nvecu{T,8}} = emit{T, ntyp{'vcnt', T}, x} -def clz{x:T & nvecu{T} & elwidth{T}<=32} = emit{T, ntyp{'vclz', T}, x} -def cls{x:T & nveci{T} & elwidth{T}<=32} = ty_u{T}~~emit{ty_s{T}, ntyp{'vcls', T}, x} - -def fold_add {a:T & nvec{T}} = emit{eltype{T}, ntyp{'vaddv', T}, a} -def fold_addw{a:T & nveci{T}} = emit{w_d{eltype{T}}, ntyp{'vaddlv', T}, a} -def fold_min {a:T & nvec{T} & ~nveci{T,64}} = emit{eltype{T}, ntyp{'vminv', T}, a} -def fold_max {a:T & nvec{T} & ~nveci{T,64}} = emit{eltype{T}, ntyp{'vmaxv', T}, a} -def vfold{F, x:T & nvec{T} & ~nveci{T,64} & same{F, min}} = fold_min{x} -def vfold{F, x:T & nvec{T} & ~nveci{T,64} & same{F, max}} = fold_max{x} -def vfold{F, x:T & nvec{T} & same{F, +}} = fold_add{x} - -def storeLow{ptr:P, w, x:T & nvec{T} & w<=64} = { def E=ty_u{w}; storeu{*E~~ptr, extract{re_el{E,T}~~x, 0}} } -def storeLow{ptr:P, w, x:T & nvec{T} & w==width{T}} = store{*T~~ptr, 0, x} - -def loadLow{ptr:P, w & w<=64} = { # a broadcast load +def addwLo{a:T,b:T if w64i{T}} = emit{el_d{T}, ntyp{'vaddl', T}, a, b} +def subwLo{a:T,b:T if w64i{T}} = emit{el_d{T}, ntyp{'vsubl', T}, a, b} +def mulwLo{a:T,b:T if w64i{T}} = emit{el_d{T}, ntyp{'vmull', T}, a, b} +def mulwHi{a:T,b:T if w128i{T}} = emit{el_m{T}, ntyp0{'vmull_high', T}, a, b} +def mulw {a:T,b:T if w128{T}} = tup{mulwLo{half{a,0}, half{b,0}}, mulwHi{a,b}} + +def shrn{a:T, s if w128i{T} and elwidth{T}>8} = { def H=el_h{T}; emit{H, ntyp0{'vshrn_n', T}, a, s} } # a>>s, narrowed +def shrm{a:T, s, d:T if nvecu{T}} = emit{T, ntyp{'vsri', '_n', T}, d, a, s} # (a>>s) | (d & (mask of new zeroes)) +def shlm{a:T, s, d:T if nvecu{T}} = emit{T, ntyp{'vsli', '_n', T}, d, a, s} # (a<<s) | (d & (mask of new zeroes)) + +def bitBlend{f:T, t:T, m:M if nvec{T} and nvecu{M,elwidth{T}} and width{T}==width{M}} = emit{T, ntyp{'vbsl', T}, m, t, f} +def homBlend{f:T, t:T, m:M if nvec{M}} = bitBlend{f, t, m} + +def addpw { x:T if nveci{T} and elwidth{T}<=32 } = emit{el_m{T}, ntyp{'vpaddl', T}, x} # add pairwise widening +def addpwa{a:D, x:T if nveci{T} and elwidth{T}<=32 and D==el_m{T}} = emit{D, ntyp{'vpadal', T}, a, x} # add pairwise widening + accumulate +def mla{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmla', T}, a, x, y} # a + x*y +def mls{a:T, x:T, y:T if nvec{T}} = emit{T, ntyp{'vmls', T}, a, x, y} # a - x*y +def rbit{x:T if nvecu{T,8}} = emit{T, ntyp{'vrbit', T}, x} +def rev{w, x:T if w==elwidth{T}} = x +def rev{w==16, x:T if elwidth{T}<16} = emit{T, ntyp{'vrev16', T}, x} # reverse the order of elements in each w-bit window +def rev{w==32, x:T if elwidth{T}<32} = emit{T, ntyp{'vrev32', T}, x} +def rev{w==64, x:T if elwidth{T}<64} = emit{T, ntyp{'vrev64', T}, x} +def popc{x:T if nvecu{T,8}} = emit{T, ntyp{'vcnt', T}, x} +def clz{x:T if nvecu{T} and elwidth{T}<=32} = emit{T, ntyp{'vclz', T}, x} +def cls{x:T if nveci{T} and elwidth{T}<=32} = ty_u{T}~~emit{ty_s{T}, ntyp{'vcls', T}, x} + +def fold_add {a:T if nvec{T}} = emit{eltype{T}, ntyp{'vaddv', T}, a} +def fold_addw{a:T if nveci{T}} = emit{w_d{eltype{T}}, ntyp{'vaddlv', T}, a} +def fold_min {a:T if nvec{T} and ~nveci{T,64}} = emit{eltype{T}, ntyp{'vminv', T}, a} +def fold_max {a:T if nvec{T} and ~nveci{T,64}} = emit{eltype{T}, ntyp{'vmaxv', T}, a} +def vfold{F, x:T if nvec{T} and ~nveci{T,64} and same{F, min}} = fold_min{x} +def vfold{F, x:T if nvec{T} and ~nveci{T,64} and same{F, max}} = fold_max{x} +def vfold{F, x:T if nvec{T} and same{F, +}} = fold_add{x} + +def storeLow{ptr:P, w, x:T if nvec{T} and w<=64} = { def E=ty_u{w}; storeu{*E~~ptr, extract{re_el{E,T}~~x, 0}} } +def storeLow{ptr:P, w, x:T if nvec{T} and w==width{T}} = store{*T~~ptr, 0, x} + +def loadLow{ptr:P, w if w<=64} = { # a broadcast load def T=eltype{P} def L=re_el{ty_u{w}, T} T ~~ emit{L, ntyp{'vld1', '_dup', L}, *ty_u{w}~~ptr} } -def loadLow{ptr:P, w & w==elwidth{P}} = load{ptr} +def loadLow{ptr:P, w if w==elwidth{P}} = load{ptr} -def undefPromote{T, x:X & w64{X} & w128{T} & eltype{T}==eltype{X}} = emit{T, ntyp{'vcombine', X}, x, x} # arm_neon.h doesn't actually provide a way to do this in a 0-instruction way. ¯\_(ツ)_/¯ -def half{x:T, n==0 & w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x} -def half{x:T, n==1 & w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x} -def pair{a:T, b:T & w64{T}} = emit{n_d{T}, ntyp0{'vcombine', T}, a, b} -def copyLane{dst:D, di, src:S, si & w64{D} & nvec{S} & eltype{D}==eltype{S}} = emit{D, ntyp{'vcopy_lane', S}, dst, di, src, si} -def copyLane{dst:D, di, src:S, si & w128{D} & nvec{S} & eltype{D}==eltype{S}} = emit{D, ntyp{'vcopyq_lane', S}, dst, di, src, si} -def broadcastSel{x:T, i & nvec{T}} = emit{T, ntyp{'vdup', tern{w128{T},'_laneq','_lane'}, T}, x, i} -def vshl{a:T, b:T, n & knum{n}} = emit{T, ntyp{'vext', T}, a, b, n} +def undefPromote{T, x:X if w64{X} and w128{T} and eltype{T}==eltype{X}} = emit{T, ntyp{'vcombine', X}, x, x} # arm_neon.h doesn't actually provide a way to do this in a 0-instruction way. ¯\_(ツ)_/¯ +def half{x:T, n==0 if w128{T}} = emit{n_h{T}, ntyp0{'vget_low', T}, x} +def half{x:T, n==1 if w128{T}} = emit{n_h{T}, ntyp0{'vget_high', T}, x} +def pair{a:T, b:T if w64{T}} = emit{n_d{T}, ntyp0{'vcombine', T}, a, b} +def copyLane{dst:D, di, src:S, si if w64{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopy_lane', S}, dst, di, src, si} +def copyLane{dst:D, di, src:S, si if w128{D} and nvec{S} and eltype{D}==eltype{S}} = emit{D, ntyp{'vcopyq_lane', S}, dst, di, src, si} +def broadcastSel{x:T, i if nvec{T}} = emit{T, ntyp{'vdup', tern{w128{T},'_laneq','_lane'}, T}, x, i} +def vshl{a:T, b:T, n if knum{n}} = emit{T, ntyp{'vext', T}, a, b, n} -def zipLo{a:T, b:T & nvec{T}} = emit{T, ntyp{'vzip1', T}, a, b} -def zipHi{a:T, b:T & nvec{T}} = emit{T, ntyp{'vzip2', T}, a, b} -def zip{a:T, b:T & nvec{T}} = tup{zipLo{a,b}, zipHi{a,b}} +def zipLo{a:T, b:T if nvec{T}} = emit{T, ntyp{'vzip1', T}, a, b} +def zipHi{a:T, b:T if nvec{T}} = emit{T, ntyp{'vzip2', T}, a, b} +def zip{a:T, b:T if nvec{T}} = tup{zipLo{a,b}, zipHi{a,b}} -def packLo{x:T, y:T & nvec{T}} = { def H=el_s{T}; emit{H, ntyp{'vuzp1', H}, H~~x, H~~y} } -def packHi{x:T, y:T & nvec{T}} = { def H=el_s{T}; emit{H, ntyp{'vuzp2', H}, H~~x, H~~y} } +def packLo{x:T, y:T if nvec{T}} = { def H=el_s{T}; emit{H, ntyp{'vuzp1', H}, H~~x, H~~y} } +def packHi{x:T, y:T if nvec{T}} = { def H=el_s{T}; emit{H, ntyp{'vuzp2', H}, H~~x, H~~y} } def packLo{{x, y}} = packLo{x, y} def packHi{{x, y}} = packHi{x, y} -def trn1{x:T, y:T & nvec{T}} = emit{T, ntyp{'vtrn1', T}, x, y} -def trn2{x:T, y:T & nvec{T}} = emit{T, ntyp{'vtrn2', T}, x, y} +def trn1{x:T, y:T if nvec{T}} = emit{T, ntyp{'vtrn1', T}, x, y} +def trn2{x:T, y:T if nvec{T}} = emit{T, ntyp{'vtrn2', T}, x, y} -def sel{L, x:T, i:I & lvec{L,16,8} & w128{T} & nvec{I, 8}} = re_el{eltype{T}, emit{I, ntyp{'vqtbl1',I}, re_el{eltype{I},x}, ty_u{i}}} +def sel{L, x:T, i:I if lvec{L,16,8} and w128{T} and nvec{I, 8}} = re_el{eltype{T}, emit{I, ntyp{'vqtbl1',I}, re_el{eltype{I},x}, ty_u{i}}} local def eqqi{A, B} = isint{A} & (quality{A}==quality{B}) # equal quality integers -def cvt{T==f64, x:X & nveci{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_f64', X}, x} -def cvt{T==i64, x:X & nvecf{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_s64', X}, x} -def cvt{T==u64, x:X & nvecf{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_u64', X}, x} +def cvt{T==f64, x:X if nveci{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_f64', X}, x} +def cvt{T==i64, x:X if nvecf{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_s64', X}, x} +def cvt{T==u64, x:X if nvecf{X,64}} = emit{[vcount{X}]T, ntyp{'vcvt', '_u64', X}, x} -def widen{T, x:X & w64{X} & eqqi{eltype{T},eltype{X}} & elwidth{T}==elwidth{X}*2} = emit{T, ntyp{'vmovl', X}, x} -def widen{T, x:X & w64{X} & eqqi{eltype{T},eltype{X}} & elwidth{T}> elwidth{X}*2} = widen{T, widen{el_s{T}, x}} -def widen{T, x:X & w64{X} & isfloat{eltype{T}}!=isfloat{eltype{X}} & elwidth{T}>elwidth{X}} = cvt{eltype{T}, widen{[vcount{T}](to_w{eltype{X},elwidth{T}}), x}} -def widen{T, x:X & w128{X} & vcount{X}>vcount{T}} = widen{T, half{x,0}} +def widen{T, x:X if w64{X} and eqqi{eltype{T},eltype{X}} and elwidth{T}==elwidth{X}*2} = emit{T, ntyp{'vmovl', X}, x} +def widen{T, x:X if w64{X} and eqqi{eltype{T},eltype{X}} and elwidth{T}> elwidth{X}*2} = widen{T, widen{el_s{T}, x}} +def widen{T, x:X if w64{X} and isfloat{eltype{T}}!=isfloat{eltype{X}} and elwidth{T}>elwidth{X}} = cvt{eltype{T}, widen{[vcount{T}](to_w{eltype{X},elwidth{T}}), x}} +def widen{T, x:X if w128{X} and vcount{X}>vcount{T}} = widen{T, half{x,0}} -def narrow{T, x:X & w128{X} & eqqi{T,eltype{X}} & width{T}*2< elwidth{X}} = narrow{T, undefPromote{el_s{X}, narrow{w_h{eltype{X}}, x}}} -def narrow{T, x:X & w128{X} & eqqi{T,eltype{X}} & width{T}*2==elwidth{X}} = emit{el_h{X}, ntyp0{'vmovn', X}, x} -def narrow{T, x:X & w128{X} & isfloat{T}!=isfloat{eltype{X}} & width{T}<elwidth{X}} = narrow{T, cvt{to_w{T, elwidth{X}}, x}} +def narrow{T, x:X if w128{X} and eqqi{T,eltype{X}} and width{T}*2< elwidth{X}} = narrow{T, undefPromote{el_s{X}, narrow{w_h{eltype{X}}, x}}} +def narrow{T, x:X if w128{X} and eqqi{T,eltype{X}} and width{T}*2==elwidth{X}} = emit{el_h{X}, ntyp0{'vmovn', X}, x} +def narrow{T, x:X if w128{X} and isfloat{T}!=isfloat{eltype{X}} and width{T}<elwidth{X}} = narrow{T, cvt{to_w{T, elwidth{X}}, x}} -def narrowUpper{lowRes:L, x:X & w64i{L} & w128{X} & el_d{L}==X} = emit{[vcount{L}*2](eltype{L}), ntyp0{'vmovn_high', X}, lowRes, x} +def narrowUpper{lowRes:L, x:X if w64i{L} and w128{X} and el_d{L}==X} = emit{[vcount{L}*2](eltype{L}), ntyp0{'vmovn_high', X}, lowRes, x} def narrowPair{a:T, b:T} = narrowUpper{narrow{w_h{eltype{T}}, a}, b} -def narrowPair{a:T, b:T & isint{eltype{T}}} = packLo{a, b} +def narrowPair{a:T, b:T if isint{eltype{T}}} = packLo{a, b} -def widenUpper{x:T & w128i{T}} = emit{el_m{T}, ntyp0{'vmovl_high', T}, x} -def widen{x:T & w128{T}} = tup{widen{el_m{T}, x}, widenUpper{x}} +def widenUpper{x:T if w128i{T}} = emit{el_m{T}, ntyp0{'vmovl_high', T}, x} +def widen{x:T if w128{T}} = tup{widen{el_m{T}, x}, widenUpper{x}} def bitAny{x:T} = fold_max{re_el{u32, x}}!=0 def bitAll{x:T} = fold_min{re_el{u32, x}}==0xffff_ffff -def topAny{x:T & nvec{T}} = fold_min{ty_s{x}}<0 -def topAll{x:T & nvec{T}} = fold_max{ty_s{x}}<0 -def homAny{x:T & nvec{T}} = bitAny{x} -def homAll{x:T & nvec{T}} = bitAll{x} +def topAny{x:T if nvec{T}} = fold_min{ty_s{x}}<0 +def topAll{x:T if nvec{T}} = fold_max{ty_s{x}}<0 +def homAny{x:T if nvec{T}} = bitAny{x} +def homAll{x:T if nvec{T}} = bitAll{x} -def homMask{x:T & nvecu{T} & elwidth{T}>=vcount{T}} = { +def homMask{x:T if nvecu{T} and elwidth{T}>=vcount{T}} = { truncBits{vcount{T}, fold_add{x & make{T, 1<<iota{vcount{T}}}}} } -def homMask{x:T & nvecu{T} & T==[16]u8} = { +def homMask{x:T if nvecu{T} and T==[16]u8} = { t:= [8]u16~~sel{[16]u8, x, make{[16]u8, 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15}} fold_add{t & make{[8]u16, (1<<iota{8})*0x0101}} } -def homMask{a:T,b:T & T==[16]u8} = { +def homMask{a:T,b:T if T==[16]u8} = { m:= make{[16]u8, 1<<(iota{16}&7)} fold_add{addpw{addpw{addp{a&m, b&m}}}<<make{[4]u32,iota{4}*8}} } -def homMask{a:T,b:T,c:T,d:T & T==[16]u8} = { +def homMask{a:T,b:T,c:T,d:T if T==[16]u8} = { m:= make{[16]u8, 1<<(iota{16}&7)} t1:= addp{a&m, b&m} t2:= addp{c&m, d&m} t3:= addp{t1, t2} extract{[2]u64~~addp{t3,t3},0} } -def homMask{...as & length{as}>1 & elwidth{type{select{as,0}}}>=32} = homMask{...each{{i}=>narrowPair{select{as,i*2},select{as,i*2+1}}, iota{length{as}/2}}} -def homMask{a:T,b:T & vcount{T}*2<=elwidth{T}} = { +def homMask{...as if length{as}>1 and elwidth{type{select{as,0}}}>=32} = homMask{...each{{i}=>narrowPair{select{as,i*2},select{as,i*2+1}}, iota{length{as}/2}}} +def homMask{a:T,b:T if vcount{T}*2<=elwidth{T}} = { def n = vcount{T} truncBits{n*2, fold_add{shrm{a,elwidth{T}-n,b} & make{T, (1<<iota{n}) | (1<<(iota{n}+n))}}} } -def andAllZero{x:T, y:T & nveci{T}} = ~bitAny{x&y} +def andAllZero{x:T, y:T if nveci{T}} = ~bitAny{x&y} -def homMaskX{a:T & eltype{T}!=u64} = { +def homMaskX{a:T if eltype{T}!=u64} = { def h = elwidth{T}/2 tup{h, truncBits{vcount{T}*h, extract{[1]u64~~shrn{el_m{T}~~a, h}, 0}}} } -def homMaskStoreF{p:P, m:M, v:T & nveci{M} & nvec{T,elwidth{M}} & eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}} +def homMaskStoreF{p:P, m:M, v:T if nveci{M} and nvec{T,elwidth{M}} and eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}} diff --git a/src/singeli/src/replicate.singeli b/src/singeli/src/replicate.singeli index d4b8395d..bc1c6dea 100644 --- a/src/singeli/src/replicate.singeli +++ b/src/singeli/src/replicate.singeli @@ -22,7 +22,7 @@ def scan_core{upd, set, scan, rp:pT, wp:W, s:(usz)} = { } def indrep_by_sum{T, rp:(*T), wp, s:(usz), js, inc} = { def scan{ptr, len} = @for (ptr over len) js=ptr+=js - def scan{ptr, len & width{T}<=32} = { + def scan{ptr, len if width{T}<=32} = { def scanfn = merge{'si_scan_pluswrap_u',fmtnat{width{T}}} p := *ty_u{eltype{type{ptr}}}~~ptr emit{void, scanfn, p, p, len, js}; js=load{ptr,len-1} @@ -103,12 +103,12 @@ rcsh4_lkup:*i8 = shiftright{0, scan{+, fold{|, table{==, rcsh4_dom, iota{64}}}}} def read_shuf_vecs{l, ellw:(u64), shp:P} = { # tuple of byte selectors in 1<<ellw def V = eltype{P} - def double{x:X & hasarch{'AVX2'}} = { + def double{x:X if hasarch{'AVX2'}} = { s:=shuf{[4]u64, x, 4b3120}; s+=s r:=each{bind{~~,[32]i8},unpackQ{s, s + X**1}} r } - def double{x:X & hasarch{'AARCH64'}} = { + def double{x:X if hasarch{'AARCH64'}} = { s:= x+x zip{s, s + X**1} } @@ -193,8 +193,8 @@ fn rep_const_shuffle_partial4(wv:u64, ellw:u64, x:*i8, r:*i8, n:u64) : void = { def step = vcount{V} # Bytes written def wvb = wv << ellw def hs = (h*step) / wvb # Actual step size in argument elements - def shufbase{i & hasarch{'AVX2'}} = shuf{[4]u64, load{*V~~(x+i)}, 4b1010} - def shufbase{i & hasarch{'AARCH64'}} = load{*V~~(x+i)} + def shufbase{i if hasarch{'AVX2'}} = shuf{[4]u64, load{*V~~(x+i)}, 4b1010} + def shufbase{i if hasarch{'AARCH64'}} = load{*V~~(x+i)} def shufrun{a, s} = sel{[16]i8, a, s} # happens to be the same across AVX2 & NEON i:u64 = 0 diff --git a/src/singeli/src/scan.singeli b/src/singeli/src/scan.singeli index 78f7294c..b6719273 100644 --- a/src/singeli/src/scan.singeli +++ b/src/singeli/src/scan.singeli @@ -35,7 +35,7 @@ def scan_post{T, init, x:*T, r:*T, len:(u64), op, pre} = { # Associative scan ?` if a?b?a = a?b = b?a, used for ⌊⌈ def scan_idem = scan_scal -fn scan_idem{T, op & hasarch{'X86_64'}}(x:*T, r:*T, len:u64, init:T) : void = { +fn scan_idem{T, op if hasarch{'X86_64'}}(x:*T, r:*T, len:u64, init:T) : void = { scan_post{T, init, x, r, len, op, make_scan_idem{T, op}} } @@ -54,7 +54,7 @@ export{'si_scan_min_i32', scan_idem_id{i32, min}}; export{'si_scan_max_i32', sca # Assumes identity is 0 def scan_assoc{op} = { def shl0{v, k} = shl{[16]u8, v, k/8} # Lanewise - def shl0{v:V, k==128 & hasarch{'AVX2'}} = { + def shl0{v:V, k==128 if hasarch{'AVX2'}} = { # Broadcast end of lane 0 to entire lane 1 l:= V~~make{[8]i32,0,0,0,-1,0,0,0,0} & spread{v} sel{[8]i32, l, make{[8]i32, 3*(3<iota{8})}} @@ -65,7 +65,7 @@ def scan_plus = scan_assoc{+} # Associative scan def scan_assoc_0 = scan_scal -fn scan_assoc_0{T, op & hasarch{'X86_64'}}(x:*T, r:*T, len:u64, init:T) : void = { +fn scan_assoc_0{T, op if hasarch{'X86_64'}}(x:*T, r:*T, len:u64, init:T) : void = { # Prefix op on entire AVX register scan_post{T, init, x, r, len, op, scan_plus} } @@ -80,7 +80,7 @@ fn scan_neq{}(p:u64, x:*u64, r:*u64, nw:u64) : void = { p = -(r>>63) # repeat sign bit } } -fn clmul_scan_ne_any{& hasarch{'PCLMUL'}}(x:*void, r:*void, init:u64, words:u64, mark:u64) : void = { +fn clmul_scan_ne_any{if hasarch{'PCLMUL'}}(x:*void, r:*void, init:u64, words:u64, mark:u64) : void = { def V = [2]u64 m := V**mark def xor64{a, i, carry} = { # carry is 64-bit broadcasted current total @@ -101,10 +101,10 @@ fn clmul_scan_ne_any{& hasarch{'PCLMUL'}}(x:*void, r:*void, init:u64, words:u64, storeLow{rv+e, 64, clmul{loadLow{xv+e, 64}, m, 0} ^ c} } } -fn scan_neq{& hasarch{'PCLMUL'}}(init:u64, x:*u64, r:*u64, nw:u64) : void = { +fn scan_neq{if hasarch{'PCLMUL'}}(init:u64, x:*u64, r:*u64, nw:u64) : void = { clmul_scan_ne_any{}(*void~~x, *void~~r, init, nw, -(u64~~1)) } -fn scan_neq{& hasarch{'AVX512BW', 'VPCLMULQDQ', 'GFNI'}}(init:u64, x:*u64, r:*u64, nw:u64) : void = { +fn scan_neq{if hasarch{'AVX512BW', 'VPCLMULQDQ', 'GFNI'}}(init:u64, x:*u64, r:*u64, nw:u64) : void = { def V = [8]u64 def sse{a} = make{[2]u64, a, 0} carry := sse{init} @@ -136,7 +136,7 @@ fn bcs{T}(x:*u64, r:*T, l:u64) : void = { c:T = 0 @for (r over i to l) { c+= cast_i{T, bitp_get{x,i}}; r = c } } -fn bcs{T & hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = { +fn bcs{T if hasarch{'AVX2'}}(x:*u64, r:*T, l:u64) : void = { def U = ty_u{T} def w = width{T} def vl= 256 / w @@ -202,7 +202,7 @@ def addChk{a:T, b:T} = { def bad = emit{u1, '__builtin_add_overflow', a, b, mem} tup{bad, load{mem}} } -def addChk{a:T, b:T & T==f64} = tup{0, a+b} +def addChk{a:T, b:T if T==f64} = tup{0, a+b} def widenFull{E, xs} = { merge{...each{{x:X} => { @@ -219,9 +219,9 @@ def widenFull{E, xs} = { }, xs}} } -def floor{x & knum{x}} = x - (x%1) -def maxabsval{T & issigned{T}} = -minvalue{T} -def maxsafeint{T & issigned{T}} = maxvalue{T} +def floor{x if knum{x}} = x - (x%1) +def maxabsval{T if issigned{T}} = -minvalue{T} +def maxsafeint{T if issigned{T}} = maxvalue{T} def maxsafeint{T==f64} = 1<<53 fn plus_scan{X, R, O}(x:*X, c:R, r:*R, len:u64) : O = { diff --git a/src/singeli/src/scan_common.singeli b/src/singeli/src/scan_common.singeli index d4cf5ad1..0d861f2e 100644 --- a/src/singeli/src/scan_common.singeli +++ b/src/singeli/src/scan_common.singeli @@ -1,9 +1,9 @@ # Used by scan.singeli and bins.singeli def sel8{v:V, t} = sel{[16]u8, v, make{re_el{i8,V}, t}} -def sel8{v:V, t & w256{V} & istup{t} & length{t}==16} = sel8{v, merge{t,t}} +def sel8{v:V, t if w256{V} and istup{t} and length{t}==16} = sel8{v, merge{t,t}} -def shuf{T, v, n & istup{n}} = shuf{T, v, base{4,n}} +def shuf{T, v, n if istup{n}} = shuf{T, v, base{4,n}} local def rev{t} = { def l=length{t}; def j=l-1; select{j-t, j-range{l}} } local def rev{up,t} = if (up) t else rev{t} @@ -19,14 +19,14 @@ def spread{a:VT, ...up} = { } # Set all elements with the last element of the input -def toLast{n:VT, up & hasarch{'X86_64'} & w128{VT}} = { +def toLast{n:VT, up if hasarch{'X86_64'} and w128{VT}} = { def l{v, w} = l{zip{up,v}, 2*w} - def l{v, w & hasarch{'SSSE3'}} = sel8{v, up*(16-w/8)+iota{16}%(w/8)} + def l{v, w if hasarch{'SSSE3'}} = sel8{v, up*(16-w/8)+iota{16}%(w/8)} def l{v, w==32} = shuf{[4]i32, v, 4**(up*3)} def l{v, w==64} = shuf{[4]i32, v, (2*up) + tup{0,1,0,1}} l{n, elwidth{VT}} } -def toLast{n:VT, up & hasarch{'AVX2'} & w256{VT}} = { +def toLast{n:VT, up if hasarch{'AVX2'} and w256{VT}} = { if (elwidth{VT}<=32) sel{[8]i32, spread{n,up}, [8]i32**(up*7)} else shuf{[4]u64, n, 4**(up*3)} } @@ -51,17 +51,17 @@ def make_scan_idem{T, op, up} = { def id = make{V, merger{c**get_id{op,T}, (width{V}/w-c)**0}} (if (up) shl else shr){[16]u8, v, k/8} | id } - def shb{v, k & hasarch{'SSSE3'}} = sel8{v, shift{k/8,16}} - def shb{v, k & k>=32} = shuf{[4]u32, v, shift{k/32,4}} - def shb{v, k & k==128 & hasarch{'AVX2'}} = { + def shb{v, k if hasarch{'SSSE3'}} = sel8{v, shift{k/8,16}} + def shb{v, k if k>=32} = shuf{[4]u32, v, shift{k/32,4}} + def shb{v, k if k==128 and hasarch{'AVX2'}} = { # After lanewise scan, broadcast end of lane 0 to entire lane 1 sel{[8]i32, spread{v,up}, make{[8]i32, rev{up,3*(3<iota{8})}}} } prefix_byshift{op, shb} } def make_scan_idem{T==f64, op, up} = { - def sc{a:T & vcount{T}==2} = op{a, zip{~up,a}} - def sc{a:T & hasarch{'AVX2'} & w256{T}} = { + def sc{a:T if vcount{T}==2} = op{a, zip{~up,a}} + def sc{a:T if hasarch{'AVX2'} and w256{T}} = { def sh{s, a} = op{a, shuf{[4]u64, a, rev{up,s}}} sh{tup{0,1,1,1},sh{tup{0,0,2,2},a}} } diff --git a/src/singeli/src/search.singeli b/src/singeli/src/search.singeli index e4e7c1d3..b3f9f3de 100644 --- a/src/singeli/src/search.singeli +++ b/src/singeli/src/search.singeli @@ -79,7 +79,7 @@ def simd_bittab = hasarch{'SSSE3'} def bittab_init{tab, z} = { @for (t in *TI~~tab over 256) t = z } -def bittab_init{tab, z & simd_bittab} = { +def bittab_init{tab, z if simd_bittab} = { init:= VI**z @unroll (t in *VI~~tab over 256/vcount{VI}) t = init } @@ -124,7 +124,7 @@ def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void)} = { x+=k; rem-=k; ++r } } -def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void) & simd_bittab} = { +def bittab_lookup{x0:(*void), n:(u64), r0:(*void), tab:(*void) if simd_bittab} = { def {bitsel, _} = bittab_selector{readbytes{*VI~~tab}} def k = vcount{VI} @for (x in *VI~~x0, r in *ty_u{k}~~r0 over cdiv{n,k}) r = bitsel{x} @@ -295,11 +295,11 @@ def acc{unr, init:T} = { else a1 = F{a1} } } -def isI64{x:T & eltype{T}==f64 & hasarch{'AARCH64'}} = x == cvt{f64, cvt{i64, x}} -def isI64{x:T & eltype{T}==f64 & hasarch{'SSE4.1'}} = (x==floor{x}) & (abs{x}<=T**(1<<53)) +def isI64{x:T if eltype{T}==f64 and hasarch{'AARCH64'}} = x == cvt{f64, cvt{i64, x}} +def isI64{x:T if eltype{T}==f64 and hasarch{'SSE4.1'}} = (x==floor{x}) & (abs{x}<=T**(1<<53)) def maskBlend{b:T, x:T, M} = x -def maskBlend{b:T, x:T, M & M{0}} = homBlend{b, x, M{T, 'to homogeneous bits'}} +def maskBlend{b:T, x:T, M if M{0}} = homBlend{b, x, M{T, 'to homogeneous bits'}} fn getRange{E}(x0:*void, res:*i64, n:u64) : u1 = { assert{n>0} @@ -600,7 +600,7 @@ fn hashtab{T, name}(rpi:*rty{name}, iv:*void, mi:usz, fv:*void, ni:usz, links:it def try_vec_memb{..._} = {} def try_vec_memb{T, hash, sz, sh, maxh, has_maxh, swap, rp, fp, n, done - & hasarch{'SSE4.2'} & T==u32} = { + if hasarch{'SSE4.2'} and T==u32} = { # Hash h wants bin h>>sh, so the offset for h in slot i is (in infite-precision ints) # i-h>>sh = i+((1<<sh-1)-h)>>sh = (((i+1)<<sh-1)-h)>>sh # We maintain io = (i+1)<<sh-1 diff --git a/src/singeli/src/select.singeli b/src/singeli/src/select.singeli index 55fb35ab..983e4e2b 100644 --- a/src/singeli/src/select.singeli +++ b/src/singeli/src/select.singeli @@ -9,11 +9,11 @@ include 'util/tup' # def:T - masked original content # b:B - pointer to data to index; if width{B}<elwidth{T}, padding bytes are garbage read after wanted position # idx - actual (unscaled) index list -def gather{d:T, b:B, idx:([8]i32), M & w256{T,32}} = { +def gather{d:T, b:B, idx:([8]i32), M if w256{T,32}} = { if (M{0}) T ~~ emit{[8]i32, '_mm256_mask_i32gather_epi32', d, *void~~b, idx, M{T,'to sign bits'}, elwidth{B}/8} else T ~~ emit{[8]i32, '_mm256_i32gather_epi32', *void~~b, idx, elwidth{B}/8} } -def gather{d:T, b:B, idx:([4]i32), M & w256{T,64}} = { +def gather{d:T, b:B, idx:([4]i32), M if w256{T,64}} = { if (M{0}) T ~~ emit{[4]i64, '_mm256_mask_i32gather_epi64', d, *void~~b, idx, M{T,'to sign bits'}, elwidth{B}/8} else T ~~ emit{[4]i64, '_mm256_i32gather_epi64', *void~~b, idx, elwidth{B}/8} } @@ -70,7 +70,7 @@ def makeselx{VI, VD, nsel, xd, logv, cshuf} = { def bb{c}{f, v} = (if (f) bblendn{c<v}; else bblend{(c&v)==v}) def bs{b, c, x} = cshuf{x, c} - def bs{b, c, x & length{b}>0} = { + def bs{b, c, x if length{b}>0} = { select{b,0}{each{bs{slice{b,1}, c, .}, x}} } diff --git a/src/singeli/src/slash.singeli b/src/singeli/src/slash.singeli index b22c3b72..2f6cdc9b 100644 --- a/src/singeli/src/slash.singeli +++ b/src/singeli/src/slash.singeli @@ -7,8 +7,8 @@ if_inline (hasarch{'X86_64'}) { include './mask' include 'util/tup' -def popcRand{x:T & isint{T} & width{T}==64} = emit{u8, 'rand_popc64', x} # under valgrind, return a random result in the range of possible ones -def popcRand{x:T & isint{T} & width{T}<=32} = emit{u8, 'rand_popc64', x} +def popcRand{x:T if isint{T} and width{T}==64} = emit{u8, 'rand_popc64', x} # under valgrind, return a random result in the range of possible ones +def popcRand{x:T if isint{T} and width{T}<=32} = emit{u8, 'rand_popc64', x} # Table from l bits to w-bit indices, shifted left by s, and G applied afterwards def maketab{l,w,s,G} = { @@ -64,7 +64,7 @@ def for_special_buffered{r, write_len}{vars,begin,sum,iter} = { } else { if (has_simd) { def bufw = bufn * tw - def vc = tern{hasarch{'X86_64'} & (bufw==128), 128, arch_defvw} / tw; + def vc = tern{hasarch{'X86_64'} and bufw==128, 128, arch_defvw} / tw; def R = [vc]T @unroll ((ov/vc)>>0) if (end-buf>vc) { store{*R~~r0, 0, load{*R~~buf}}; r0+=vc; buf+=vc } assert{bufw % width{R} == 0} # to make sure the below doesn't read out-of-bounds on the stack @@ -118,7 +118,7 @@ def topper{T, U, k, x} = { itab_4_16:*u64 = maketab{4,16} # 16 elts, 128B def thresh{c==0, T==i8 } = 32 def thresh{c==0, T==i16} = 16 -fn slash{c==0, T & T<=i16}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { +fn slash{c==0, T if T<=i16}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { def tw = width{T} def n = 64/tw def tab = if (tw==8) itab else itab_4_16 @@ -140,9 +140,9 @@ fn slash{c==0, T & T<=i16}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { # i16 /w & i32 x+/w; 8 elts/iter; 64 bit table input, expanded to 128 or 256 via topper def simd128{} = hasarch{'X86_64'} | hasarch{'AARCH64'} -def thresh{c==0, T==i16 & simd128{}} = 32 -def thresh{c==0, T==i32 & simd128{}} = 16 -fn slash{c==0, T & simd128{} & i16<=T & T<=i32}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { +def thresh{c==0, T==i16 if simd128{}} = 32 +def thresh{c==0, T==i32 if simd128{}} = 16 +fn slash{c==0, T if simd128{} and i16<=T and T<=i32}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { def I = [16]i8 j := I**(if (T==i16) 0 else cast_i{i8,x}) def {top, inctop} = topper{T, I, 8, x} @@ -161,9 +161,9 @@ fn slash{c==0, T & simd128{} & i16<=T & T<=i32}(w:*u64, x:arg{c,T}, r:*T, l:u64, # i8 & i16 w/x; 128 bits/iter; [16]i8 shuffle def shufb128{} = hasarch{'SSSE3'} | hasarch{'AARCH64'} -def thresh{c==1, T==i8 & shufb128{}} = 64 -def thresh{c==1, T==i16 & shufb128{}} = 32 -fn slash{c==1, T & T<=i16 & shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { +def thresh{c==1, T==i8 if shufb128{}} = 64 +def thresh{c==1, T==i16 if shufb128{}} = 32 +fn slash{c==1, T if T<=i16 and shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { def V = [16]i8 @for_special_buffered{r,8} (w in *u8~~wp over i to sum) { ind := load{itab, w} @@ -179,8 +179,8 @@ fn slash{c==1, T & T<=i16 & shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u6 # i32 w/x; 8 elts/iter into 2 steps; [16]i8 shuffle i32tab:*u32 = maketab{4,8,2} # 16 elts, 64B -def thresh{c==1, T==i32 & shufb128{}} = 8 -fn slash{c==1, T==i32 & shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { +def thresh{c==1, T==i32 if shufb128{}} = 8 +fn slash{c==1, T==i32 if shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { def V = [16]i8 expander := make{V, iota{16}>>2} trail := make{V, tail{2,iota{16}}} @@ -202,9 +202,9 @@ fn slash{c==1, T==i32 & shufb128{}}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : # i32 & i64 w/x & x+/w; 256 bits/step, 8 elts/iter; [8]i32 shuffle i64tab:*u64 = maketab{4,16,1,{x}=>(1+x)*0x100 + x} # 16 elts, 128B -def thresh{c, T==i32 & hasarch{'AVX2'}} = 32 -def thresh{c, T==i64 & hasarch{'AVX2'}} = 8 -fn slash{c, T & hasarch{'AVX2'} & T>=i32}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { +def thresh{c, T==i32 if hasarch{'AVX2'}} = 32 +def thresh{c, T==i64 if hasarch{'AVX2'}} = 8 +fn slash{c, T if hasarch{'AVX2'} and T>=i32}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { def tw = width{T} def V = [8]u32 expander := make{[32]u8, merge{...each{tup{., ... 3**128}, iota{8}>>lb{tw/32}}}} @@ -235,11 +235,11 @@ fn slash{c, T & hasarch{'AVX2'} & T>=i32}(wp:*u64, x:arg{c,T}, r:*T, l:u64, sum: } # everything; 512 bits/iter; AVX-512 compress -def thresh{c, T==i8 & hasarch{'AVX512VBMI2'}} = 256 -def thresh{c, T==i16 & hasarch{'AVX512VBMI2'}} = 128 -def thresh{c, T==i32 & hasarch{'AVX512F'}} = 64 -def thresh{c, T==i64 & hasarch{'AVX512F'}} = 16 -fn slash{c, T & hasarch{if (width{T}>=32) 'AVX512F' else 'AVX512VBMI2'}}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { +def thresh{c, T==i8 if hasarch{'AVX512VBMI2'}} = 256 +def thresh{c, T==i16 if hasarch{'AVX512VBMI2'}} = 128 +def thresh{c, T==i32 if hasarch{'AVX512F'}} = 64 +def thresh{c, T==i64 if hasarch{'AVX512F'}} = 16 +fn slash{c, T if hasarch{if (width{T}>=32) 'AVX512F' else 'AVX512VBMI2'}}(w:*u64, x:arg{c,T}, r:*T, l:u64, sum:u64) : void = { def f = fmtnat def wt = width{T} def vl = 512/wt @@ -283,7 +283,7 @@ def pext_popc{x:T, m:T} = { # - z tells how many bits in the group are NOT used # - x contains the bits, with z zeros above def build{k==1} = tup{x&m, ~m} - def build{k & k > 1} = { + def build{k if k > 1} = { def h = k>>1 # Increase size from h to k {x,z} := build{h} def low_s = lowbits{w,k} # Low bit in each new group @@ -326,12 +326,12 @@ def pext_popc{x:T, m:T} = { pe := fold{|, x&s0, each{gr, g*slice{iota{k/g},1}}} tup{pe, o>>(k-g)} } - def build{k==32 & hasarch{'AVX2'} & isvec{T}} = { + def build{k==32 if hasarch{'AVX2'} and isvec{T}} = { def S = re_el{ty_u{k}, T} def c{T,vs} = each{{v}=>T~~v, vs} c{T, multi_shift{...c{S, build{8}}, 8, k, {s}=>S**s}} } - def build{k & ~isvec{T} & k > 8} = { + def build{k if not isvec{T} and k > 8} = { multi_shift{...build{8}, 8, k, {s}=>s} } # Final result @@ -339,9 +339,9 @@ def pext_popc{x:T, m:T} = { tup{pe, scal{w} - z} } -def pext_width {& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2 -def thresh_bool{& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32 -def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = { +def pext_width {if hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2 +def thresh_bool{if hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32 +def pext_popc{x0:V, m0:V if hasarch{'PCLMUL'} and V==[2]u64} = { def clmul{a, b} = zipLo{...@collect (j to 2) clmul{a,b,j}} m := m0 x := x0 & m @@ -359,9 +359,9 @@ def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = { tup{x, @collect (j to 2) popc{extract{m0,j}}} } -def pext_width {& fast_BMI2{}} = 1 -def thresh_bool{& fast_BMI2{}} = 512 -def pext_popc{x:T, m:T & fast_BMI2{} & T==u64} = tup{pext{x, m}, popc{m}} +def pext_width {if fast_BMI2{}} = 1 +def thresh_bool{if fast_BMI2{}} = 512 +def pext_popc{x:T, m:T if fast_BMI2{} and T==u64} = tup{pext{x, m}, popc{m}} fn compress_bool(w:*u64, x:*u64, r:*u64, n:u64) : void = { cw:u64 = 0; # current word @@ -375,7 +375,7 @@ fn compress_bool(w:*u64, x:*u64, r:*u64, n:u64) : void = { } ro = ro2%64 } - def extract{t, i & istup{t}} = select{t,i} + def extract{t, i if istup{t}} = select{t,i} def v = pext_width{} if (v > 1) { def V = [v]u64 diff --git a/src/singeli/src/squeeze.singeli b/src/singeli/src/squeeze.singeli index a794c995..2d16a10f 100644 --- a/src/singeli/src/squeeze.singeli +++ b/src/singeli/src/squeeze.singeli @@ -7,24 +7,24 @@ include './vecfold' def preserve_negative_zero = 0 # SSE2 versions avoid any 64-bit integer comparsions -def anySNaN{M, x:T & eltype{T}==u64} = { +def anySNaN{M, x:T if eltype{T}==u64} = { homAny{inRangeLen{M{x}<<1, (0xFFE<<52)+2, (1<<52)-2}} } -def anySNaN{M, x:T & T==[2]u64 & hasarch{'X86_64'} & ~hasarch{'SSE4.2'}} = { +def anySNaN{M, x:T if T==[2]u64 and hasarch{'X86_64'} and not hasarch{'SSE4.2'}} = { topAny{M{andnot{unord{[2]f64~~x, [2]f64~~x}, [2]u64~~([4]u32**0xFFF8_0000 == ([4]u32~~x | [4]u32**0x8000_0000))}}} } -def anyNonChar{M, x:T & isvec{T} & eltype{T}==u64} = homAny{M{~inRangeLen{x, cbqn_c32Tag{}<<48, 1<<48}}} -def anyNonChar{M, x:T & isvec{T} & hasarch{'X86_64'}} = { +def anyNonChar{M, x:T if isvec{T} and eltype{T}==u64} = homAny{M{~inRangeLen{x, cbqn_c32Tag{}<<48, 1<<48}}} +def anyNonChar{M, x:T if isvec{T} and hasarch{'X86_64'}} = { def H = re_el{u32, T} def ne = H~~x != H**cast_i{u32, cbqn_c32Tag{}<<16} topAny{M{T~~ne}} } -def cvtNarrow{T, x:X & width{T}==elwidth{X}} = cvt{T, x} -def cvtNarrow{T, x:X & width{T}< elwidth{X}} = narrow{T, x} -def cvtWiden{T, x:X & elwidth{T}==elwidth{X}} = cvt{eltype{T}, x} -def cvtWiden{T, x:X & elwidth{T}> elwidth{X}} = widen{T, x} +def cvtNarrow{T, x:X if width{T}==elwidth{X}} = cvt{T, x} +def cvtNarrow{T, x:X if width{T}< elwidth{X}} = narrow{T, x} +def cvtWiden{T, x:X if elwidth{T}==elwidth{X}} = cvt{eltype{T}, x} +def cvtWiden{T, x:X if elwidth{T}> elwidth{X}} = widen{T, x} fn squeeze{vw, X, CHR, B}(x0:*void, len:ux) : u32 = { assert{len>0} @@ -36,7 +36,7 @@ fn squeeze{vw, X, CHR, B}(x0:*void, len:ux) : u32 = { # fold with either Max or Bitwise Or, truncating/zero-extending to TE def foldTotal{TE, x:T} = cast_i{TE, vfold{|, x}} - def foldTotal{TE, x:T & hasarch{'AARCH64'}} = { + def foldTotal{TE, x:T if hasarch{'AARCH64'}} = { if (elwidth{T}==64) { if (width{TE}==64 and bulk==2) cast_i{TE, half{x,0} | half{x,1}} else vfold{max, narrow{TE, x}} diff --git a/src/singeli/src/sse.singeli b/src/singeli/src/sse.singeli index a190880c..cc1a6309 100644 --- a/src/singeli/src/sse.singeli +++ b/src/singeli/src/sse.singeli @@ -1,32 +1,32 @@ ### SSSE3 ### -def sel{L, x:T, i:I & hasarch{'SSSE3'} & lvec{L,16,8} & w128{T} & w128i{I, 8}} = T ~~ emit{[16]u8, '_mm_shuffle_epi8', v2i{x}, i} -def vshl{a:T, b:T, n & hasarch{'SSSE3'}} = T~~emit{[16]u8, '_mm_alignr_epi8', v2i{b}, v2i{a}, n*(elwidth{T}/8)} +def sel{L, x:T, i:I if hasarch{'SSSE3'} and lvec{L,16,8} and w128{T} and w128i{I, 8}} = T ~~ emit{[16]u8, '_mm_shuffle_epi8', v2i{x}, i} +def vshl{a:T, b:T, n if hasarch{'SSSE3'}} = T~~emit{[16]u8, '_mm_alignr_epi8', v2i{b}, v2i{a}, n*(elwidth{T}/8)} ### SSE4.1 ### -def packs{a:T,b:T & hasarch{'SSE4.1'} & T==[4]u32} = emit{[ 8]u16, '_mm_packus_epi32', a, b} -def andAllZero{x:T, y:T & hasarch{'SSE4.1'} & w128i{T}} = emit{u1, '_mm_testz_si128', x, y} +def packs{a:T,b:T if hasarch{'SSE4.1'} and T==[4]u32} = emit{[ 8]u16, '_mm_packus_epi32', a, b} +def andAllZero{x:T, y:T if hasarch{'SSE4.1'} and w128i{T}} = emit{u1, '_mm_testz_si128', x, y} # conversion -def widen{T==[8]u16, x:X & hasarch{'SSE4.1'} & X==[16]u8} = emit{T, '_mm_cvtepu8_epi16', x}; def widen{T==[8]i16, x:X & hasarch{'SSE4.1'} & X==[16]i8} = emit{T, '_mm_cvtepi8_epi16', x} -def widen{T==[4]u32, x:X & hasarch{'SSE4.1'} & X==[16]u8} = emit{T, '_mm_cvtepu8_epi32', x}; def widen{T==[4]i32, x:X & hasarch{'SSE4.1'} & X==[16]i8} = emit{T, '_mm_cvtepi8_epi32', x} -def widen{T==[4]u32, x:X & hasarch{'SSE4.1'} & X==[8]u16} = emit{T, '_mm_cvtepu16_epi32', x}; def widen{T==[4]i32, x:X & hasarch{'SSE4.1'} & X==[8]i16} = emit{T, '_mm_cvtepi16_epi32', x} -def widen{T==[2]u64, x:X & hasarch{'SSE4.1'} & X==[16]u8} = emit{T, '_mm_cvtepu8_epi64', x}; def widen{T==[2]i64, x:X & hasarch{'SSE4.1'} & X==[16]i8} = emit{T, '_mm_cvtepi8_epi64', x} -def widen{T==[2]u64, x:X & hasarch{'SSE4.1'} & X==[8]u16} = emit{T, '_mm_cvtepu16_epi64', x}; def widen{T==[2]i64, x:X & hasarch{'SSE4.1'} & X==[8]i16} = emit{T, '_mm_cvtepi16_epi64', x} -def widen{T==[2]u64, x:X & hasarch{'SSE4.1'} & X==[4]u32} = emit{T, '_mm_cvtepu32_epi64', x}; def widen{T==[2]i64, x:X & hasarch{'SSE4.1'} & X==[4]i32} = emit{T, '_mm_cvtepi32_epi64', x} -def widen{T==[2]f64, x:X & hasarch{'SSE4.1'} & w128i{X} & elwidth{X}<32} = widen{T, widen{[4]i32, x}} +def widen{T==[8]u16, x:X if hasarch{'SSE4.1'} and X==[16]u8} = emit{T, '_mm_cvtepu8_epi16', x}; def widen{T==[8]i16, x:X if hasarch{'SSE4.1'} and X==[16]i8} = emit{T, '_mm_cvtepi8_epi16', x} +def widen{T==[4]u32, x:X if hasarch{'SSE4.1'} and X==[16]u8} = emit{T, '_mm_cvtepu8_epi32', x}; def widen{T==[4]i32, x:X if hasarch{'SSE4.1'} and X==[16]i8} = emit{T, '_mm_cvtepi8_epi32', x} +def widen{T==[4]u32, x:X if hasarch{'SSE4.1'} and X==[8]u16} = emit{T, '_mm_cvtepu16_epi32', x}; def widen{T==[4]i32, x:X if hasarch{'SSE4.1'} and X==[8]i16} = emit{T, '_mm_cvtepi16_epi32', x} +def widen{T==[2]u64, x:X if hasarch{'SSE4.1'} and X==[16]u8} = emit{T, '_mm_cvtepu8_epi64', x}; def widen{T==[2]i64, x:X if hasarch{'SSE4.1'} and X==[16]i8} = emit{T, '_mm_cvtepi8_epi64', x} +def widen{T==[2]u64, x:X if hasarch{'SSE4.1'} and X==[8]u16} = emit{T, '_mm_cvtepu16_epi64', x}; def widen{T==[2]i64, x:X if hasarch{'SSE4.1'} and X==[8]i16} = emit{T, '_mm_cvtepi16_epi64', x} +def widen{T==[2]u64, x:X if hasarch{'SSE4.1'} and X==[4]u32} = emit{T, '_mm_cvtepu32_epi64', x}; def widen{T==[2]i64, x:X if hasarch{'SSE4.1'} and X==[4]i32} = emit{T, '_mm_cvtepi32_epi64', x} +def widen{T==[2]f64, x:X if hasarch{'SSE4.1'} and w128i{X} and elwidth{X}<32} = widen{T, widen{[4]i32, x}} -def narrow{T, x:X & hasarch{'SSE4.1'} & w128i{X,32} & T==i8} = sel{[16]u8, [16]i8~~x, make{[16]i8, 0,4,8,12, 0,0,0,0, 0,0,0,0, 0,0,0,0}} -def narrow{T, x:X & hasarch{'SSE4.1'} & w128i{X,32} & T==i16} = sel{[16]u8, [8]i16~~x, make{[16]i8, 0,1,4,5, 8,9,12,13, 0,0,0,0, 0,0,0,0}} +def narrow{T, x:X if hasarch{'SSE4.1'} and w128i{X,32} and T==i8} = sel{[16]u8, [16]i8~~x, make{[16]i8, 0,4,8,12, 0,0,0,0, 0,0,0,0, 0,0,0,0}} +def narrow{T, x:X if hasarch{'SSE4.1'} and w128i{X,32} and T==i16} = sel{[16]u8, [8]i16~~x, make{[16]i8, 0,1,4,5, 8,9,12,13, 0,0,0,0, 0,0,0,0}} # mask stuff -def andAllZero{x:T, y:T & hasarch{'SSE4.1'} & w128i{T}} = emit{u1, '_mm_testz_si128', x, y} -def topBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128i{M,32}} = T ~~ emit{[4]f32, '_mm_blendv_ps', v2f{f}, v2f{t}, v2f{m}} -def topBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128i{M,64}} = T ~~ emit{[2]f64, '_mm_blendv_pd', v2d{f}, v2d{t}, v2d{m}} -def topBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128i{M, 8}} = T ~~ emit{[16]i8, '_mm_blendv_epi8', v2i{f}, v2i{t}, v2i{m}} +def andAllZero{x:T, y:T if hasarch{'SSE4.1'} and w128i{T}} = emit{u1, '_mm_testz_si128', x, y} +def topBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128i{M,32}} = T ~~ emit{[4]f32, '_mm_blendv_ps', v2f{f}, v2f{t}, v2f{m}} +def topBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128i{M,64}} = T ~~ emit{[2]f64, '_mm_blendv_pd', v2d{f}, v2d{t}, v2d{m}} +def topBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128i{M, 8}} = T ~~ emit{[16]i8, '_mm_blendv_epi8', v2i{f}, v2i{t}, v2i{m}} # assumes all bits are the same in each mask item -def homBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128{M} & elwidth{M}!=16} = topBlend{f, t, m} -def homBlend{f:T, t:T, m:M & hasarch{'SSE4.1'} & w128{T} & w128{M,16}} = topBlend{f, t, [16]i8~~m} +def homBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128{M} and elwidth{M}!=16} = topBlend{f, t, m} +def homBlend{f:T, t:T, m:M if hasarch{'SSE4.1'} and w128{T} and w128{M,16}} = topBlend{f, t, [16]i8~~m} diff --git a/src/singeli/src/sse2.singeli b/src/singeli/src/sse2.singeli index 8e82ad46..eff66f82 100644 --- a/src/singeli/src/sse2.singeli +++ b/src/singeli/src/sse2.singeli @@ -1,37 +1,37 @@ # compact casting for the annoying intrinsic type system -def v2i{x:T & w128{T}} = if(isint{eltype{T}}) x else [16]u8 ~~ x -def v2f{x:T & w128{T}} = [4]f32 ~~ x -def v2d{x:T & w128{T}} = [2]f64 ~~ x +def v2i{x:T if w128{T}} = if(isint{eltype{T}}) x else [16]u8 ~~ x +def v2f{x:T if w128{T}} = [4]f32 ~~ x +def v2d{x:T if w128{T}} = [2]f64 ~~ x # load & store -def loadLow{ptr:P, w & w128{eltype{P}} & w== 16} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si16', ptr} -def loadLow{ptr:P, w & w128{eltype{P}} & w== 32} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si32', ptr} -def loadLow{ptr:P, w & w128{eltype{P}} & w== 64} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si64', ptr} -def loadLow{ptr:P, w & w128{eltype{P}} & w==128} = load{ptr} +def loadLow{ptr:P, w if w128{eltype{P}} and w== 16} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si16', ptr} +def loadLow{ptr:P, w if w128{eltype{P}} and w== 32} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si32', ptr} +def loadLow{ptr:P, w if w128{eltype{P}} and w== 64} = eltype{P} ~~ emit{[16]u8, '_mm_loadu_si64', ptr} +def loadLow{ptr:P, w if w128{eltype{P}} and w==128} = load{ptr} -def storeLow{ptr:P, w, x:T & w128{T} & w== 16} = emit{void, '_mm_storeu_si16', ptr, v2i{x}} -def storeLow{ptr:P, w, x:T & w128{T} & w== 32} = emit{void, '_mm_storeu_si32', ptr, v2i{x}} -def storeLow{ptr:P, w, x:T & w128{T} & w== 64} = emit{void, '_mm_storeu_si64', ptr, v2i{x}} -def storeLow{ptr:P, w, x:T & w128{T} & w==128} = store{*T~~ptr, 0, x} +def storeLow{ptr:P, w, x:T if w128{T} and w== 16} = emit{void, '_mm_storeu_si16', ptr, v2i{x}} +def storeLow{ptr:P, w, x:T if w128{T} and w== 32} = emit{void, '_mm_storeu_si32', ptr, v2i{x}} +def storeLow{ptr:P, w, x:T if w128{T} and w== 64} = emit{void, '_mm_storeu_si64', ptr, v2i{x}} +def storeLow{ptr:P, w, x:T if w128{T} and w==128} = store{*T~~ptr, 0, x} # float comparison -def unord{a:T,b:T & T==[4]f32} = [4]u32~~emit{[4]f32, '_mm_cmpunord_ps', a, b} -def unord{a:T,b:T & T==[2]f64} = [2]u64~~emit{[2]f64, '_mm_cmpunord_pd', a, b} +def unord{a:T,b:T if T==[4]f32} = [4]u32~~emit{[4]f32, '_mm_cmpunord_ps', a, b} +def unord{a:T,b:T if T==[2]f64} = [2]u64~~emit{[2]f64, '_mm_cmpunord_pd', a, b} # shift -def shl{S==[16]u8, x:T, n & w128{T}} = T ~~ emit{T, '_mm_bslli_si128', x, n} -def shr{S==[16]u8, x:T, n & w128{T}} = T ~~ emit{T, '_mm_bsrli_si128', x, n} +def shl{S==[16]u8, x:T, n if w128{T}} = T ~~ emit{T, '_mm_bslli_si128', x, n} +def shr{S==[16]u8, x:T, n if w128{T}} = T ~~ emit{T, '_mm_bsrli_si128', x, n} # integer arith -def mulh{a:T,b:T & [8]i16==T} = emit{T, '_mm_mulhi_epi16', a, b} -def mulh{a:T,b:T & [8]u16==T} = emit{T, '_mm_mulhi_epu16', a, b} -def mul32{a:T,b:T & [2]u64==T} = emit{T, '_mm_mul_epu32', a, b} # reads only low 32 bits of arguments -def __mul{a:T,b:T & [4]i32==T} = { +def mulh{a:T,b:T if [8]i16==T} = emit{T, '_mm_mulhi_epi16', a, b} +def mulh{a:T,b:T if [8]u16==T} = emit{T, '_mm_mulhi_epu16', a, b} +def mul32{a:T,b:T if [2]u64==T} = emit{T, '_mm_mul_epu32', a, b} # reads only low 32 bits of arguments +def __mul{a:T,b:T if [4]i32==T} = { def mu{x, y} = [4]i32 ~~ mul32{[2]u64~~x, [2]u64~~y} def sw{n, ...vs} = each{{c} => shuf{[4]i32, c, n}, vs} lo:= mu{a, b} @@ -44,55 +44,55 @@ def rsqrtE{a:([4]f32)} = emit{[4]f32, '_mm_rsqrt_ps', a} def rcpE{a:([4]f32)} = emit{[4]f32, '_mm_rcp_ps', a} # mask stuff -def andAllZero{x:T, y:T & w128i{T}} = homAll{(x & y) == T**0} +def andAllZero{x:T, y:T if w128i{T}} = homAll{(x & y) == T**0} -def topMask{x:T & w128{T, 8}} = emit{u16, '_mm_movemask_epi8', x} -def topMask{x:T & w128{T, 16}} = topMask{packs{[8]i16~~x, [8]i16**0}} -def topMask{x:T & w128{T, 32}} = emit{u8, '_mm_movemask_ps', v2f{x}} -def topMask{x:T & w128{T, 64}} = emit{u8, '_mm_movemask_pd', v2d{x}} -def homMask{x:T & w128{T}} = topMask{x} -def homMaskX{a:T & elwidth{T}==16} = tup{2, homMask{re_el{u8,a}}} -def homMask{a:T, b:T & w128i{T,16}} = homMask{packs{ty_s{a},ty_s{b}}} +def topMask{x:T if w128{T, 8}} = emit{u16, '_mm_movemask_epi8', x} +def topMask{x:T if w128{T, 16}} = topMask{packs{[8]i16~~x, [8]i16**0}} +def topMask{x:T if w128{T, 32}} = emit{u8, '_mm_movemask_ps', v2f{x}} +def topMask{x:T if w128{T, 64}} = emit{u8, '_mm_movemask_pd', v2d{x}} +def homMask{x:T if w128{T}} = topMask{x} +def homMaskX{a:T if elwidth{T}==16} = tup{2, homMask{re_el{u8,a}}} +def homMask{a:T, b:T if w128i{T,16}} = homMask{packs{ty_s{a},ty_s{b}}} -def homAny{x:T & w128i{T}} = homMask{[16]u8 ~~ x} != 0 -def homAll{x:T & w128i{T}} = homMask{[16]u8 ~~ x} == 0xffff +def homAny{x:T if w128i{T}} = homMask{[16]u8 ~~ x} != 0 +def homAll{x:T if w128i{T}} = homMask{[16]u8 ~~ x} == 0xffff -def topAny{x:T & w128i{T}} = topMask{x} != 0 -def topAll{x:T & w128i{T}} = topMask{x} == (1<<vcount{T})-1 -def topAny{x:T & w128i{T, 16}} = homAny{[8]i16~~x < [8]i16**0} -def topAll{x:T & w128i{T, 16}} = homAll{[8]i16~~x < [8]i16**0} +def topAny{x:T if w128i{T}} = topMask{x} != 0 +def topAll{x:T if w128i{T}} = topMask{x} == (1<<vcount{T})-1 +def topAny{x:T if w128i{T, 16}} = homAny{[8]i16~~x < [8]i16**0} +def topAll{x:T if w128i{T, 16}} = homAll{[8]i16~~x < [8]i16**0} # bits of other things SSE2 has -def packs{a:T,b:T & T==[8]i16} = emit{[16]i8, '_mm_packs_epi16', a, b} -def packs{a:T,b:T & T==[4]i32} = emit{[ 8]i16, '_mm_packs_epi32', a, b} -def packs{a:T,b:T & T==[8]u16} = emit{[16]u8, '_mm_packus_epi16', a, b} -def packQ{a:T,b:T & w128i{T}} = packs{a,b} +def packs{a:T,b:T if T==[8]i16} = emit{[16]i8, '_mm_packs_epi16', a, b} +def packs{a:T,b:T if T==[4]i32} = emit{[ 8]i16, '_mm_packs_epi32', a, b} +def packs{a:T,b:T if T==[8]u16} = emit{[16]u8, '_mm_packus_epi16', a, b} +def packQ{a:T,b:T if w128i{T}} = packs{a,b} -def zipLo{a:T, b:T & w128i{T}} = emit{T, merge{'_mm_unpacklo_epi',fmtnat{elwidth{T}}}, a, b} -def zipHi{a:T, b:T & w128i{T}} = emit{T, merge{'_mm_unpackhi_epi',fmtnat{elwidth{T}}}, a, b} -def zipLo{a:T, b:T & w128f{T}} = emit{T, merge{'_mm_unpacklo_p',if (elwidth{T}==32) 's' else 'd'}, a, b} -def zipHi{a:T, b:T & w128f{T}} = emit{T, merge{'_mm_unpackhi_p',if (elwidth{T}==32) 's' else 'd'}, a, b} -def zip{a:T, b:T & w128i{T}} = tup{zipLo{a,b}, zipHi{a,b}} +def zipLo{a:T, b:T if w128i{T}} = emit{T, merge{'_mm_unpacklo_epi',fmtnat{elwidth{T}}}, a, b} +def zipHi{a:T, b:T if w128i{T}} = emit{T, merge{'_mm_unpackhi_epi',fmtnat{elwidth{T}}}, a, b} +def zipLo{a:T, b:T if w128f{T}} = emit{T, merge{'_mm_unpacklo_p',if (elwidth{T}==32) 's' else 'd'}, a, b} +def zipHi{a:T, b:T if w128f{T}} = emit{T, merge{'_mm_unpackhi_p',if (elwidth{T}==32) 's' else 'd'}, a, b} +def zip{a:T, b:T if w128i{T}} = tup{zipLo{a,b}, zipHi{a,b}} -def unpackQ{a:T, b:T & w128{T}} = mzip{a, b} +def unpackQ{a:T, b:T if w128{T}} = mzip{a, b} -def shuf{L, x:T, n & w128{T} & lvec{L,4,32} & knum{n}} = T ~~ emit{[4]i32, '_mm_shuffle_epi32', v2i{x}, n} +def shuf{L, x:T, n if w128{T} and lvec{L,4,32} and knum{n}} = T ~~ emit{[4]i32, '_mm_shuffle_epi32', v2i{x}, n} def shuf16Lo{x:T, n} = T~~emit{[8]i16, '_mm_shufflelo_epi16', x, n} def shuf16Hi{x:T, n} = T~~emit{[8]i16, '_mm_shufflehi_epi16', x, n} -def homBlend{f:T, t:T, m:M & w128{T} & w128i{M,elwidth{T}}} = T ~~ ((M~~t & m) | (M~~f &~ m)) -def homMaskStoreF{p:P, m:M, v:T & w128i{M} & w128{T,elwidth{M}} & eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}} +def homBlend{f:T, t:T, m:M if w128{T} and w128i{M,elwidth{T}}} = T ~~ ((M~~t & m) | (M~~f &~ m)) +def homMaskStoreF{p:P, m:M, v:T if w128i{M} and w128{T,elwidth{M}} and eltype{P}==T} = store{p, 0, homBlend{load{p}, v, m}} -def widen{T, x:X & w128i{T} & w128i{X} & w128s{T}==w128s{X} & elwidth{T}>elwidth{X}} = { +def widen{T, x:X if w128i{T} and w128i{X} and w128s{T}==w128s{X} and elwidth{T}>elwidth{X}} = { def s{v} = s{mzipLo{v, v}} - def s{v:V & V==T} = v + def s{v:V if V==T} = v s{x} >> (elwidth{T} - elwidth{X}) } -def widen{T==[2]f64, x:X & w128s{X} & elwidth{X}<32} = widen{T, widen{[4]i32, x}} -def widen{T==[2]f64, x:X & X==[4]i32} = emit{T, '_mm_cvtepi32_pd', x} -def widen{T==[2]f64, x:X & X==[4]f32} = emit{T, '_mm_cvtps_pd', x} +def widen{T==[2]f64, x:X if w128s{X} and elwidth{X}<32} = widen{T, widen{[4]i32, x}} +def widen{T==[2]f64, x:X if X==[4]i32} = emit{T, '_mm_cvtepi32_pd', x} +def widen{T==[2]f64, x:X if X==[4]f32} = emit{T, '_mm_cvtps_pd', x} def narrow{T==i16, x:([4]i32)} = packs{x,x} def narrow{T==i8, x:([8]i16)} = packs{x,x} @@ -104,5 +104,5 @@ def narrow{T==u8, x:([2]u64)} = { def f{v} = narrow{u8, [8]u16~~v}; f{f{f{x}}}} def narrow{T==u16, x:([2]u64)} = shuf16Lo{[8]u16~~shuf{[4]i32, x, 4b3320}, 4b3320} def narrow{T==u32, x:([2]u64)} = [4]u32~~shuf{[4]i32, x, 4b3320} -def narrow{T, x:X & w128f{X,64} & T<i32} = narrow{T, narrow{i32, x}} +def narrow{T, x:X if w128f{X,64} and T<i32} = narrow{T, narrow{i32, x}} def narrow{T==i32, x:([2]f64)} = emit{[4]i32, '_mm_cvtpd_epi32', x} diff --git a/src/singeli/src/transpose.singeli b/src/singeli/src/transpose.singeli index 8ebab38e..45d8dbfd 100644 --- a/src/singeli/src/transpose.singeli +++ b/src/singeli/src/transpose.singeli @@ -25,16 +25,16 @@ def unpack_to{f, l, x} = { def shuf_pass{x} = each{{v} => shuf{[4]i64, v, 4b3120}, x} # Square kernel where width is a full vector -def transpose_square{VT, l, x & hasarch{'AVX2'}} = unpack_to{1, l/2, x} +def transpose_square{VT, l, x if hasarch{'AVX2'}} = unpack_to{1, l/2, x} def load2{a:T, b:T} = pair{load{a}, load{b}} -def store2{a:T, b:T, v:T2 & w128i{eltype{T}} & w256{T2}} = { +def store2{a:T, b:T, v:T2 if w128i{eltype{T}} and w256{T2}} = { each{{p:P, i} => store{p, 0, eltype{P}~~half{v,i}}, tup{a,b}, iota{2}} } -def load_k {VT, src, l, w & w256{VT}} = each{{i} =>load {*VT~~(src+i*w), 0 }, iota{l}} -def store_k{VT, dst, x, l, h & w256{VT}} = each{{i,v}=>store{*VT~~(dst+i*h), 0, VT~~v}, iota{l}, x} -def load_k {VT, src, l, w & w128{VT}} = each{{i} =>{p:=src+ i*w; load2 {*VT~~p, *VT~~(p+l*w) }}, iota{l}} -def store_k{VT, dst, x, l, h & w128{VT}} = each{{i,v}=>{p:=dst+2*i*h; store2{*VT~~p, *VT~~(p+ h), v}}, iota{l}, x} +def load_k {VT, src, l, w if w256{VT}} = each{{i} =>load {*VT~~(src+i*w), 0 }, iota{l}} +def store_k{VT, dst, x, l, h if w256{VT}} = each{{i,v}=>store{*VT~~(dst+i*h), 0, VT~~v}, iota{l}, x} +def load_k {VT, src, l, w if w128{VT}} = each{{i} =>{p:=src+ i*w; load2 {*VT~~p, *VT~~(p+l*w) }}, iota{l}} +def store_k{VT, dst, x, l, h if w128{VT}} = each{{i,v}=>{p:=dst+2*i*h; store2{*VT~~p, *VT~~(p+ h), v}}, iota{l}, x} # Transpose kernel of size kw,kh in size w,h array def kernel{src:P, dst:P, kw, kh, w, h} = { diff --git a/src/singeli/src/vecfold.singeli b/src/singeli/src/vecfold.singeli index f0aadea2..92a57f45 100644 --- a/src/singeli/src/vecfold.singeli +++ b/src/singeli/src/vecfold.singeli @@ -1,6 +1,6 @@ # Fold associative/commutative operation across a register -def vfold{F, x:T & w128{T} & hasarch{'X86_64'}} = { +def vfold{F, x:T if w128{T} and hasarch{'X86_64'}} = { c:= x def EW = elwidth{T} if (EW<=64) c = F{c, shuf{[4]u32, c, 4b1032}} @@ -9,4 +9,4 @@ def vfold{F, x:T & w128{T} & hasarch{'X86_64'}} = { if (EW==8) { v:=extract{[8]i16~~c, 0}; F{cast_i{eltype{T}, v}, cast_i{eltype{T}, v>>8}} } else extract{c, 0} } -def vfold{F, x:T & w256{T} & hasarch{'X86_64'}} = vfold{F, F{half{x, 0}, half{x, 1}}} +def vfold{F, x:T if w256{T} and hasarch{'X86_64'}} = vfold{F, F{half{x, 0}, half{x, 1}}} |