diff options
author | dzaima <dzaimagit@gmail.com> | 2024-01-01 16:40:19 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-01 16:40:19 +0200 |
commit | a82d3e1c68143b750906bfd2a6396d83b9b9b97e (patch) | |
tree | c8cc03c527125266d4161ea1037828c7c99b7b09 | |
parent | 9931c1756c8f775f9a1e0abf813351a3a52d88f8 (diff) | |
parent | dbb6fbade19c1bc367fd6e4faf9dd5d4aa893a4b (diff) |
Merge pull request #102 from mlochbaum/misc
Miscellaneous
-rwxr-xr-x | build/src/build.bqn | 16 | ||||
-rw-r--r-- | src/builtins/cells.c | 5 | ||||
-rw-r--r-- | src/builtins/slash.c | 88 | ||||
-rw-r--r-- | src/builtins/transpose.c | 75 | ||||
-rw-r--r-- | src/singeli/src/base.singeli | 2 | ||||
-rw-r--r-- | src/singeli/src/replicate.singeli (renamed from src/singeli/src/constrep.singeli) | 72 | ||||
-rw-r--r-- | src/singeli/src/slash.singeli | 10 | ||||
-rw-r--r-- | src/singeli/src/transpose.singeli | 12 |
8 files changed, 192 insertions, 88 deletions
diff --git a/build/src/build.bqn b/build/src/build.bqn index e8c4981d..615e8526 100755 --- a/build/src/build.bqn +++ b/build/src/build.bqn @@ -659,10 +659,10 @@ cachedBin‿linkerCache ← { "xa."‿"src/utils/bits.c"‿"bits", "xag"‿"src/builtins/transpose.c"‿"transpose", "xag"‿"src/builtins/search.c"‿"search", "xag"‿"src/builtins/selfsearch.c"‿"selfsearch" "xag"‿"src/builtins/scan.c"‿"scan", "xa."‿"src/builtins/fold.c"‿"fold", - "xag"‿"src/builtins/sort.c"‿"bins" + "xag"‿"src/builtins/slash.c"‿"slash", "xag"‿"src/builtins/slash.c"‿"replicate", + "xag"‿"src/builtins/sort.c"‿"bins", "xa."‿"src/builtins/slash.c"‿"count" - "x.."‿"src/builtins/select.c"‿"select", "xa."‿"src/builtins/slash.c"‿"constrep", - "xag"‿"src/builtins/slash.c"‿"slash", "xa."‿"src/builtins/slash.c"‿"count" + "x.."‿"src/builtins/select.c"‿"select" ⟩ objs ← ⟨⟩ @@ -694,10 +694,12 @@ cachedBin‿linkerCache ← { •file.Name ga, ⟨⟩ ⟩ - singeliArgs ← po.singeliFlags∾⟨"-l", "gen="∾AtRoot singeliCache.folder, "-c", "usz=u"∾•Repr po.usz⟩∾{ - po.native? ⟨⟩; - "-a" ⋈ 1↓∾ ','⊸∾¨ po.singeliArch - } + singeliArgs ← ∾⟨ + po.singeliFlags + ⟨"-l", "gen="∾AtRoot singeliCache.folder, "-c", "usz=u"∾•Repr po.usz⟩ + (⊑"slow-pdep"<⊸∊po.has)/⟨"-c", "SLOW_PDEP=1"⟩ + {po.native? ⟨⟩; "-a" ⋈ 1↓∾ ','⊸∾¨ po.singeliArch} + ⟩ {𝕊: "Singeli args: "∾•Repr singeliArgs} _verboseLog @ {𝕊: "Singeli-required C args: "∾•Repr po.siCFlags; @} _verboseLog @ singeliObjs ↩ {MakeSingeliInv ⟨singeliArgs, {𝕊:UpdateSubmodule po.singeliDir}, singeliCache, 𝕩, "src/singeli/src/"•file.At 𝕩∾".singeli", (𝕩≡"dyarith")/⟨gaRule⟩⟩}¨ 1⊑¨singeliMap diff --git a/src/builtins/cells.c b/src/builtins/cells.c index 1ff71961..354fb8f5 100644 --- a/src/builtins/cells.c +++ b/src/builtins/cells.c @@ -9,6 +9,7 @@ B shape_c2(B, B, B); B transp_c2(B, B, B); B fold_rows(Md1D* d, B x); // from fold.c B takedrop_highrank(bool take, B w, B x); // from sfns.c +B try_interleave_cells(B w, B x, ur xr, ur xk, usz* xsh); // from transpose.c // X - variable name; XSH - its shape; K - number of leading axes that get iterated over; SLN - number of slices that will be made; DX - additional refcount count to add to x #define S_KSLICES(X, XSH, K, SLN, DX)\ @@ -609,6 +610,10 @@ NOINLINE B for_cells_AA(B f, B w, B x, ur wcr, ur xcr, u32 chr) { if (rsh) shcpy(rsh, zsh, zk); decG(w); decG(x); return taga(r); } + if (rtid==n_couple && wr==xr) { + B r = try_interleave_cells(w, x, xr, xk, xsh); + if (!q_N(r)) { decG(w); decG(x); return r; } + } } if (isPervasiveDy(f)) { if (TI(w,elType)==el_B || TI(x,elType)==el_B) goto generic; diff --git a/src/builtins/slash.c b/src/builtins/slash.c index d1dcc5d0..90b9c924 100644 --- a/src/builtins/slash.c +++ b/src/builtins/slash.c @@ -86,12 +86,13 @@ #endif #if SINGELI + extern void (*const si_scan_pluswrap_u8)(uint8_t* v0,uint8_t* v1,uint64_t v2,uint8_t v3); + extern void (*const si_scan_pluswrap_u16)(uint16_t* v0,uint16_t* v1,uint64_t v2,uint16_t v3); + extern void (*const si_scan_pluswrap_u32)(uint32_t* v0,uint32_t* v1,uint64_t v2,uint32_t v3); + extern void (*const si_scan_max_i32)(int32_t* v0,int32_t* v1,uint64_t v2); #define SINGELI_FILE slash #include "../utils/includeSingeli.h" -#endif - -#if SINGELI_AVX2 || SINGELI_NEON - #define SINGELI_FILE constrep + #define SINGELI_FILE replicate #include "../utils/includeSingeli.h" #endif @@ -100,20 +101,6 @@ #include "../utils/includeSingeli.h" #endif -#if SINGELI - extern void (*const si_scan_pluswrap_u8)(uint8_t* v0,uint8_t* v1,uint64_t v2,uint8_t v3); - extern void (*const si_scan_pluswrap_u16)(uint16_t* v0,uint16_t* v1,uint64_t v2,uint16_t v3); - extern void (*const si_scan_pluswrap_u32)(uint32_t* v0,uint32_t* v1,uint64_t v2,uint32_t v3); - #define ALIAS(I,U) static void si_scan_pluswrap_##I(I* a, I* b, u64 c, I d) { si_scan_pluswrap_##U((U*)a, (U*)b, c, d); } - ALIAS(i8,u8) ALIAS(i16,u16) ALIAS(i32,u32) - #undef ALIAS - #define si_scan_pluswrap_u64(V0,V1,V2,V3) for (usz i=k; i<e; i++) js=rp[i]+=js; - #define PLUS_SCAN(T) si_scan_pluswrap_##T(rp+k,rp+k,e-k,js); js=rp[e-1]; - extern void (*const si_scan_max_i32)(int32_t* v0,int32_t* v1,uint64_t v2); -#else - #define PLUS_SCAN(T) for (usz i=k; i<e; i++) js=rp[i]+=js; -#endif - // Dense Where, still significantly worse than SIMD // Assumes modifiable DST #define WHERE_DENSE(SRC, DST, LEN, OFF) do { \ @@ -537,32 +524,6 @@ static B compress(B w, B x, usz wia, u8 xl, u8 xt) { return r; } -// Replicate using plus/max/xor-scan -#define SCAN_CORE(WV, UPD, SET, SCAN) \ - usz b = 1<<10; \ - for (usz k=0, j=0, ij=WV; ; ) { \ - usz e = b<s-k? k+b : s; \ - for (usz i=k; i<e; i++) rp[i]=0; \ - SET; \ - while (ij<e) { j++; UPD; ij+=WV; } \ - SCAN; \ - if (e==s) {break;} k=e; \ - } -#define SUM_CORE(T, WV, PREP, INC) \ - SCAN_CORE(WV, PREP; rp[ij]+=INC, , PLUS_SCAN(T)) - -#if SINGELI_AVX2 - #define IND_BY_SCAN \ - SCAN_CORE(xp[j], rp[ij]=j, rp[k]=j, si_scan_max_i32(rp+k,rp+k,e-k)) -#else - #define IND_BY_SCAN usz js=0; SUM_CORE(i32, xp[j], , 1) -#endif - -#define REP_BY_SCAN(T, WV) \ - T* xp = xv; T* rp = rv; \ - T js=xp[0], px=js; \ - SUM_CORE(T, WV, T sx=px, (px=xp[j])-sx) - #define BOOL_REP_XOR_SCAN(WV) \ usz b = 1<<12; \ u64 xx=xp[0], xs=xx>>63, js=-(xx&1); xx^=xx<<1; \ @@ -622,14 +583,13 @@ B slash_c1(B t, B x) { for (u64 j = 0; j < c; j++) *rp++ = i; } } else { + #if SINGELI if (s/32 <= xia) { // Sparse case: type of x matters - #define SPARSE_IND(T) T* xp = T##any_ptr(x); IND_BY_SCAN i32* rp; r = m_i32arrv(&rp, s); - if (xe == el_i8 ) { SPARSE_IND(i8 ); } - else if (xe == el_i16) { SPARSE_IND(i16); } - else { SPARSE_IND(i32); } - #undef SPARSE_IND - } else { // Dense case: only result type matters + si_indices_scan_i32[elwByteLog(xe)](tyany_ptr(x), rp, s); + } else + #endif + { // Dense case: only result type matters #define DENSE_IND(T) \ T* rp; r = m_##T##arrv(&rp, s); \ for (u64 i = 0; i < xia; i++) { \ @@ -748,17 +708,12 @@ B slash_c2(B t, B w, B x) { void* rv = m_tyarrlv(&r, xk, s, xt); if (rsh) { Arr* ra=a(r); SPRNK(ra,xr); ra->sh = rsh; ra->ia = s*arr_csz(x); } void* xv = tyany_ptr(x); + #if SINGELI if ((xk<3? s/64 : s/32) <= wia) { // Sparse case: use both types - #define CASE(L,XT) case L: { REP_BY_SCAN(XT, wp[j]) break; } - #define SPARSE_REP(WT) \ - WT* wp = WT##any_ptr(w); \ - switch (xk) { default: UD; CASE(0,u8) CASE(1,u16) CASE(2,u32) CASE(3,u64) } - if (we == el_i8 ) { SPARSE_REP(i8 ); } - else if (we == el_i16) { SPARSE_REP(i16); } - else { SPARSE_REP(i32); } - #undef SPARSE_REP - #undef CASE - } else { // Dense case: only type of x matters + si_replicate_scan[4*elwByteLog(we) + xk](tyany_ptr(w), xv, rv, s); + } else + #endif + { // Dense case: only type of x matters #define CASE(L,T) case L: { \ T* xp = xv; T* rp = rv; \ for (usz i = 0; i < wia; i++) { \ @@ -845,13 +800,18 @@ B slash_c2(B t, B w, B x) { u8 xk = xl-3; void* rv = m_tyarrv(&r, 1<<xk, s, xt); void* xv = tyany_ptr(x); - #if SINGELI_AVX2 || SINGELI_NEON - simd_constrep[xk](wv, xv, rv, xlen); + #if SINGELI + si_constrep[xk](wv, xv, rv, xlen); #else - #define CASE(L,T) case L: { REP_BY_SCAN(T, wv) break; } + #define CASE(L,T) case L: { \ + T* xp = xv; T* rp = rv; \ + for (usz i = 0; i < xlen; i++) { \ + for (i64 j = 0; j < wv; j++) *rp++ = xp[i]; \ + } \ + } break; switch (xk) { default: UD; CASE(0,u8) CASE(1,u16) CASE(2,u32) CASE(3,u64) } - #endif #undef CASE + #endif } atmW_maybesh:; diff --git a/src/builtins/transpose.c b/src/builtins/transpose.c index 331b0127..63dba290 100644 --- a/src/builtins/transpose.c +++ b/src/builtins/transpose.c @@ -2,7 +2,7 @@ // Transpose // One length-2 axis: dedicated code -// Boolean: pdep for height 2; pext for width 2 +// Boolean: pdep or emulation for height 2; pext for width 2 // SHOULD use a generic implementation if BMI2 not present // SHOULD optimize other short lengths with pdep/pext and shuffles // Boolean 𝕩: convert to integer @@ -40,6 +40,9 @@ #include "../utils/calls.h" #ifdef __BMI2__ + #if !SLOW_PDEP + #define FAST_PDEP 1 + #endif #include <immintrin.h> #if USE_VALGRIND #define _pdep_u64 vg_pdep_u64 @@ -66,6 +69,67 @@ typedef void (*TranspFn)(void*,void*,u64,u64,u64,u64); #endif +static void interleave_bits(u64* rp, void* x0v, void* x1v, usz n) { + u32* x0 = (u32*)x0v; u32* x1 = (u32*)x1v; + for (usz i=0; i<BIT_N(n); i++) { + #if FAST_PDEP + rp[i] = _pdep_u64(x0[i], 0x5555555555555555) | _pdep_u64(x1[i], 0xAAAAAAAAAAAAAAAA); + #else + #define STEP(V,M,SH) V = (V | V<<SH) & M; + #define EXPAND(V) \ + STEP(V, 0x0000ffff0000ffff, 16) \ + STEP(V, 0x00ff00ff00ff00ff, 8) \ + STEP(V, 0x0f0f0f0f0f0f0f0f, 4) \ + STEP(V, 0x3333333333333333, 2) \ + STEP(V, 0x5555555555555555, 1) + u64 e0 = x0[i]; EXPAND(e0); + u64 e1 = x1[i]; EXPAND(e1); + rp[i] = e0 | e1<<1; + #undef EXPAND + #undef STEP + #endif + } +} + +// Interleave arrays, 𝕨≍⎉(-xk)𝕩. Doesn't consume. +// Return bi_N if there isn't fast code. +B try_interleave_cells(B w, B x, ur xr, ur xk, usz* xsh) { + assert(RNK(w)==xr && xr>=1); + u8 xe = TI(x,elType); if (xe!=TI(w,elType)) return bi_N; + usz csz = shProd(xsh, xk, xr); + if (csz & (csz-1)) return bi_N; // Not power of 2 + u8 xlw = elwBitLog(xe); + usz n = shProd(xsh, 0, xk); + usz ia = 2*n*csz; + Arr *r; + if (csz==1 && xlw==0) { + u64* rp; r=m_bitarrp(&rp, ia); + interleave_bits(rp, bitarr_ptr(w), bitarr_ptr(x), ia); + } else + #if SINGELI + if (csz==1 && xe==el_B) { + B* wp = TO_BPTR(w); B* xp = TO_BPTR(x); + HArr_p p = m_harrUv(ia); // Debug build complains with harrUp + si_interleave[3](p.a, wp, xp, n); + for (usz i=0; i<ia; i++) inc(p.a[i]); + NOGC_E; + B rb = p.b; + if (SFNS_FILLS) rb = qWithFill(rb, fill_both(w, x)); + r = a(rb); + } else if (csz<=64>>xlw && csz<<xlw>=8) { // Require CPU-sized cells + assert(xe!=el_B); + void* rv; + if (xlw==0) { u64* rp; r = m_bitarrp(&rp, ia); rv=rp; } + else rv = m_tyarrp(&r,elWidth(xe),ia,el2t(xe)); + si_interleave[CTZ(csz<<xlw)-3](rv, tyany_ptr(w), tyany_ptr(x), n); + } else + #endif + return bi_N; + usz* sh = arr_shAlloc(r, xr+1); + shcpy(sh, xsh, xk); sh[xk]=2; shcpy(sh+xk+1, xsh+xk, xr-xk); + return taga(r); +} + static void transpose_move(void* rv, void* xv, u8 xe, usz w, usz h) { assert(xe!=el_bit); assert(xe!=el_B); transposeFns[elwByteLog(xe)](rv, xv, w, h, w, h); @@ -87,14 +151,12 @@ static Arr* transpose_noshape(B* px, usz ia, usz w, usz h) { r=a(qWithFill(p.b, xf)); } else if (xe==el_bit) { - #ifdef __BMI2__ if (h==2) { - u32* x0 = (u32*)bitarr_ptr(x); u64* rp; r=m_bitarrp(&rp, ia); Arr* x1o = TI(x,slice)(inc(x),w,w); - u32* x1 = (u32*) ((TyArr*)x1o)->a; - for (usz i=0; i<BIT_N(ia); i++) rp[i] = _pdep_u64(x0[i], 0x5555555555555555) | _pdep_u64(x1[i], 0xAAAAAAAAAAAAAAAA); + interleave_bits(rp, bitarr_ptr(x), ((TyArr*)x1o)->a, ia); mm_free((Value*)x1o); + #ifdef __BMI2__ } else if (w==2) { u64* xp = bitarr_ptr(x); u64* r0; r=m_bitarrp(&r0, ia); @@ -106,9 +168,8 @@ static Arr* transpose_noshape(B* px, usz ia, usz w, usz h) { } bit_cpyN(r0, h, r1, 0, h); TFREE(r1); - } else #endif - { + } else { *px = x = taga(cpyI8Arr(x)); xe=el_i8; void* rv = m_tyarrp(&r,elWidth(xe),ia,el2t(xe)); void* xv = tyany_ptr(x); diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli index 68ded7f3..d5e11120 100644 --- a/src/singeli/src/base.singeli +++ b/src/singeli/src/base.singeli @@ -5,6 +5,7 @@ include 'util/kind' def ux = u64 config usz = u32 +config SLOW_PDEP = 0 def same = is oper ~~ reinterpret infix right 55 @@ -157,6 +158,7 @@ def base{b,l} = if (0==tuplen{l}) 0 else tupsel{0,l}+b*base{b,slice{l,1}} # vector definitions def arch_defvw = if (hasarch{'AVX2'}) 256 else 128 def has_simd = hasarch{'X86_64'} | hasarch{'AARCH64'} +def fast_BMI2{} = if (SLOW_PDEP) 0 else hasarch{'BMI2'} # test if vector has a specific width & element type def lvec{T, n, w} = 0 diff --git a/src/singeli/src/constrep.singeli b/src/singeli/src/replicate.singeli index 28e6b0a0..82c981ea 100644 --- a/src/singeli/src/constrep.singeli +++ b/src/singeli/src/replicate.singeli @@ -1,8 +1,72 @@ include './base' -if (hasarch{'AVX2'} | hasarch{'AARCH64'}) { - include './mask' include 'util/tup' + +def ind_types = tup{i8, i16, i32} +def dat_types = tup{...ind_types, u64} + +# Indices and Replicate using plus- or max-scan +def scan_core{upd, set, scan, rp:pT, wp:W, s:(usz)} = { + def getw{j} = if (isptr{W}) cast_i{usz,load{wp,j}} else wp + b:usz = 1<<10 + k:usz = 0; j:usz = 0; ij:=getw{j} + while (1) { + e := tern{b<s-k, k+b, s} + @for (rp over i from k to e) rp = 0 + if (set) store{rp, k, cast_i{eltype{pT},j}} + while (ij<e) { ++j; upd{rp, j, ij}; ij+=getw{j} } + scan{rp+k, e-k} + if (e==s) return{} + k = e + } +} +def indrep_by_sum{T, rp:*T, wp, s:(usz), js, inc} = { + def scan{ptr, len} = @for (ptr over len) js=ptr+=js + def scan{ptr, len & width{T}<=32} = { + def scanfn = merge{'si_scan_pluswrap_u',fmtnat{width{T}}} + p := *ty_u{eltype{type{ptr}}}~~ptr + emit{void, scanfn, p, p, len, js}; js=load{ptr,len-1} + } + def upd{rp, j, ij} = store{rp, ij, load{rp,ij}+inc{j}} + scan_core{upd, 0, scan, rp, wp, s} +} + +fn ind_by_scan_i32{W}(xv:*void, rp:*i32, s:usz) : void = { + xp := *W~~xv + if (hasarch{'X86_64'} & ~hasarch{'SSE4.1'}) { # no min instruction + js:i32 = 0 + indrep_by_sum{i32, rp, xp, s, js, {j}=>1} + } else { + scan_core{ + {rp,j,ij} => store{rp,ij,cast_i{i32,j}}, 1, + {ptr,len} => emit{void, 'si_scan_max_i32', ptr,ptr,len}, + rp, xp, s + } + } +} + +def rep_by_scan{T, wp, xv:*void, rv:*void, s} = { + xp := *T~~xv; js := *xp; px := js + def inc{j} = {sx:=px; px=load{xp,j}; px-sx} + indrep_by_sum{T, *T~~rv, wp, s, js, inc} +} +fn rep_by_scan{W, T}(wp:*void, xv:*void, rv:*void, s:usz) : void = { + rep_by_scan{T, *W~~wp, xv, rv, s} +} + +exportT{'si_indices_scan_i32', each{ind_by_scan_i32, ind_types}} +exportT{'si_replicate_scan', flat_table{rep_by_scan, ind_types, dat_types}} + + +# Constant replicate +if (not (hasarch{'AVX2'} | hasarch{'AARCH64'})) { + +fn rep_const{T}(wv:u64, x:*void, r:*void, n:u64) : void = { + rep_by_scan{T, cast_i{usz,wv}, x, r, cast_i{usz, wv*n}} +} + +} else { + def incl{a,b} = slice{iota{b+1},a} # 1+˝∨`⌾⌽0=div|⌜range @@ -220,6 +284,6 @@ fn rep_const{T}(wv:u64, x:*void, r:*void, n:u64) : void = { } } -exportT{'simd_constrep', each{rep_const, tup{i8, i16, i32, u64}}} +} -}
\ No newline at end of file +exportT{'si_constrep', each{rep_const, dat_types}} diff --git a/src/singeli/src/slash.singeli b/src/singeli/src/slash.singeli index e1560dd4..e66b97fa 100644 --- a/src/singeli/src/slash.singeli +++ b/src/singeli/src/slash.singeli @@ -339,8 +339,8 @@ def pext_popc{x:T, m:T} = { tup{pe, scal{w} - z} } -def pext_width {..._ & hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2 -def thresh_bool{..._ & hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32 +def pext_width {& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2 +def thresh_bool{& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32 def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = { def clmul{a, b} = zipLo{...@collect (j to 2) clmul{a,b,j}} m := m0 @@ -359,9 +359,9 @@ def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = { tup{x, @collect (j to 2) popc{extract{m0,j}}} } -def pext_width {..._ & hasarch{'BMI2'}} = 1 -def thresh_bool{..._ & hasarch{'BMI2'}} = 512 -def pext_popc{x:T, m:T & hasarch{'BMI2'} & T==u64} = tup{pext{x, m}, popc{m}} +def pext_width {& fast_BMI2{}} = 1 +def thresh_bool{& fast_BMI2{}} = 512 +def pext_popc{x:T, m:T & fast_BMI2{} & T==u64} = tup{pext{x, m}, popc{m}} fn compress_bool(w:*u64, x:*u64, r:*u64, n:u64) : void = { cw:u64 = 0; # current word diff --git a/src/singeli/src/transpose.singeli b/src/singeli/src/transpose.singeli index 08dbc977..4fe11838 100644 --- a/src/singeli/src/transpose.singeli +++ b/src/singeli/src/transpose.singeli @@ -150,6 +150,14 @@ def transpose_with_kernel{T, k, kh, call_base, rp:*T, xp:*T, w, h, ws, hs} = { } } +# Interleave n values of type T from x0 and x1 into r +fn interleave{T}(r0:*void, x0:*void, x1:*void, n:u64) : void = { + rp := *T~~r0 + @for (x0 in *T~~x0, x1 in *T~~x1 over i to n) { + store{rp, i*2, x0}; store{rp, i*2+1, x1} + } +} + fn transpose{T, k, kh}(r0:*void, x0:*void, w:u64, h:u64, ws:u64, hs:u64) : void = { # Scalar transpose defined in C def ts = if (T==i8) 'i8' else if (T==i16) 'i16' else if (T==i32) 'i32' else 'i64' @@ -160,7 +168,7 @@ fn transpose{T, k, kh}(r0:*void, x0:*void, w:u64, h:u64, ws:u64, hs:u64) : void if (hasarch{'AVX2'} and w>=k and h>=k) { transpose_with_kernel{T, k, kh, call_base, rp, xp, w, h, ws, hs} } else { - if (h==2 and h==hs) @for (x0 in xp, x1 in xp+ws over i to w) { store{rp, i*2, x0}; store{rp, i*2+1, x1} } + if (h==2 and h==hs) interleave{T}(r0, x0, *void~~(xp+ws), w) else if (w==2 and w==ws) @for (r0 in rp, r1 in rp+hs over i to h) { r0 = load{xp, i*2}; r1 = load{xp, i*2+1} } else call_base{rp, xp, w, h} } @@ -174,3 +182,5 @@ exportT{'simd_transpose', tup{ transpose{i32, 8}, transpose{i64, 4} }} + +exportT{'si_interleave', each{interleave, tup{i8, i16, i32, i64}}} |