author     dzaima <dzaimagit@gmail.com>        2024-01-01 16:40:19 +0200
committer  GitHub <noreply@github.com>         2024-01-01 16:40:19 +0200
commit     a82d3e1c68143b750906bfd2a6396d83b9b9b97e (patch)
tree       c8cc03c527125266d4161ea1037828c7c99b7b09
parent     9931c1756c8f775f9a1e0abf813351a3a52d88f8 (diff)
parent     dbb6fbade19c1bc367fd6e4faf9dd5d4aa893a4b (diff)
Merge pull request #102 from mlochbaum/misc
Miscellaneous
-rwxr-xr-x  build/src/build.bqn                 16
-rw-r--r--  src/builtins/cells.c                 5
-rw-r--r--  src/builtins/slash.c                88
-rw-r--r--  src/builtins/transpose.c            75
-rw-r--r--  src/singeli/src/base.singeli         2
-rw-r--r--  src/singeli/src/replicate.singeli (renamed from src/singeli/src/constrep.singeli)  72
-rw-r--r--  src/singeli/src/slash.singeli       10
-rw-r--r--  src/singeli/src/transpose.singeli   12
8 files changed, 192 insertions, 88 deletions
diff --git a/build/src/build.bqn b/build/src/build.bqn
index e8c4981d..615e8526 100755
--- a/build/src/build.bqn
+++ b/build/src/build.bqn
@@ -659,10 +659,10 @@ cachedBin‿linkerCache ← {
"xa."‿"src/utils/bits.c"‿"bits", "xag"‿"src/builtins/transpose.c"‿"transpose",
"xag"‿"src/builtins/search.c"‿"search", "xag"‿"src/builtins/selfsearch.c"‿"selfsearch"
"xag"‿"src/builtins/scan.c"‿"scan", "xa."‿"src/builtins/fold.c"‿"fold",
- "xag"‿"src/builtins/sort.c"‿"bins"
+ "xag"‿"src/builtins/slash.c"‿"slash", "xag"‿"src/builtins/slash.c"‿"replicate",
+ "xag"‿"src/builtins/sort.c"‿"bins", "xa."‿"src/builtins/slash.c"‿"count"
- "x.."‿"src/builtins/select.c"‿"select", "xa."‿"src/builtins/slash.c"‿"constrep",
- "xag"‿"src/builtins/slash.c"‿"slash", "xa."‿"src/builtins/slash.c"‿"count"
+ "x.."‿"src/builtins/select.c"‿"select"
objs ← ⟨⟩
@@ -694,10 +694,12 @@ cachedBin‿linkerCache ← {
•file.Name ga, ⟨⟩
- singeliArgs ← po.singeliFlags∾⟨"-l", "gen="∾AtRoot singeliCache.folder, "-c", "usz=u"∾•Repr po.usz⟩∾{
- po.native? ⟨⟩;
- "-a" ⋈ 1↓∾ ','⊸∾¨ po.singeliArch
- }
+ singeliArgs ← ∾⟨
+ po.singeliFlags
+ ⟨"-l", "gen="∾AtRoot singeliCache.folder, "-c", "usz=u"∾•Repr po.usz⟩
+ (⊑"slow-pdep"<⊸∊po.has)/⟨"-c", "SLOW_PDEP=1"⟩
+ {po.native? ⟨⟩; "-a" ⋈ 1↓∾ ','⊸∾¨ po.singeliArch}
+ ⟩
{𝕊: "Singeli args: "∾•Repr singeliArgs} _verboseLog @
{𝕊: "Singeli-required C args: "∾•Repr po.siCFlags; @} _verboseLog @
singeliObjs ↩ {MakeSingeliInv ⟨singeliArgs, {𝕊:UpdateSubmodule po.singeliDir}, singeliCache, 𝕩, "src/singeli/src/"•file.At 𝕩∾".singeli", (𝕩≡"dyarith")/⟨gaRule⟩⟩}¨ 1⊑¨singeliMap
diff --git a/src/builtins/cells.c b/src/builtins/cells.c
index 1ff71961..354fb8f5 100644
--- a/src/builtins/cells.c
+++ b/src/builtins/cells.c
@@ -9,6 +9,7 @@ B shape_c2(B, B, B);
B transp_c2(B, B, B);
B fold_rows(Md1D* d, B x); // from fold.c
B takedrop_highrank(bool take, B w, B x); // from sfns.c
+B try_interleave_cells(B w, B x, ur xr, ur xk, usz* xsh); // from transpose.c
// X - variable name; XSH - its shape; K - number of leading axes that get iterated over; SLN - number of slices that will be made; DX - additional refcount count to add to x
#define S_KSLICES(X, XSH, K, SLN, DX)\
@@ -609,6 +610,10 @@ NOINLINE B for_cells_AA(B f, B w, B x, ur wcr, ur xcr, u32 chr) {
if (rsh) shcpy(rsh, zsh, zk);
decG(w); decG(x); return taga(r);
}
+ if (rtid==n_couple && wr==xr) {
+ B r = try_interleave_cells(w, x, xr, xk, xsh);
+ if (!q_N(r)) { decG(w); decG(x); return r; }
+ }
}
if (isPervasiveDy(f)) {
if (TI(w,elType)==el_B || TI(x,elType)==el_B) goto generic;
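[Note] The cells.c change routes w ≍˘ x (couple over same-rank cells) to a shared fast path in transpose.c. For reference, a plain scalar version of that operation could look like the sketch below; interleave_cells_u8 is a hypothetical helper for illustration, not CBQN code, and the real path handles each cell as a machine word or via si_interleave.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>
    // Coupling the cells of two equal-shape arrays writes cell i of w,
    // then cell i of x, for each of the ncells leading cells.
    // csz is the number of elements per cell (u8 elements assumed here).
    static void interleave_cells_u8(const uint8_t* w, const uint8_t* x,
                                    uint8_t* r, size_t ncells, size_t csz) {
      for (size_t i = 0; i < ncells; i++) {
        memcpy(r, w + i*csz, csz); r += csz;   // cell i of w
        memcpy(r, x + i*csz, csz); r += csz;   // cell i of x
      }
    }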
diff --git a/src/builtins/slash.c b/src/builtins/slash.c
index d1dcc5d0..90b9c924 100644
--- a/src/builtins/slash.c
+++ b/src/builtins/slash.c
@@ -86,12 +86,13 @@
#endif
#if SINGELI
+ extern void (*const si_scan_pluswrap_u8)(uint8_t* v0,uint8_t* v1,uint64_t v2,uint8_t v3);
+ extern void (*const si_scan_pluswrap_u16)(uint16_t* v0,uint16_t* v1,uint64_t v2,uint16_t v3);
+ extern void (*const si_scan_pluswrap_u32)(uint32_t* v0,uint32_t* v1,uint64_t v2,uint32_t v3);
+ extern void (*const si_scan_max_i32)(int32_t* v0,int32_t* v1,uint64_t v2);
#define SINGELI_FILE slash
#include "../utils/includeSingeli.h"
-#endif
-
-#if SINGELI_AVX2 || SINGELI_NEON
- #define SINGELI_FILE constrep
+ #define SINGELI_FILE replicate
#include "../utils/includeSingeli.h"
#endif
@@ -100,20 +101,6 @@
#include "../utils/includeSingeli.h"
#endif
-#if SINGELI
- extern void (*const si_scan_pluswrap_u8)(uint8_t* v0,uint8_t* v1,uint64_t v2,uint8_t v3);
- extern void (*const si_scan_pluswrap_u16)(uint16_t* v0,uint16_t* v1,uint64_t v2,uint16_t v3);
- extern void (*const si_scan_pluswrap_u32)(uint32_t* v0,uint32_t* v1,uint64_t v2,uint32_t v3);
- #define ALIAS(I,U) static void si_scan_pluswrap_##I(I* a, I* b, u64 c, I d) { si_scan_pluswrap_##U((U*)a, (U*)b, c, d); }
- ALIAS(i8,u8) ALIAS(i16,u16) ALIAS(i32,u32)
- #undef ALIAS
- #define si_scan_pluswrap_u64(V0,V1,V2,V3) for (usz i=k; i<e; i++) js=rp[i]+=js;
- #define PLUS_SCAN(T) si_scan_pluswrap_##T(rp+k,rp+k,e-k,js); js=rp[e-1];
- extern void (*const si_scan_max_i32)(int32_t* v0,int32_t* v1,uint64_t v2);
-#else
- #define PLUS_SCAN(T) for (usz i=k; i<e; i++) js=rp[i]+=js;
-#endif
-
// Dense Where, still significantly worse than SIMD
// Assumes modifiable DST
#define WHERE_DENSE(SRC, DST, LEN, OFF) do { \
@@ -537,32 +524,6 @@ static B compress(B w, B x, usz wia, u8 xl, u8 xt) {
return r;
}
-// Replicate using plus/max/xor-scan
-#define SCAN_CORE(WV, UPD, SET, SCAN) \
- usz b = 1<<10; \
- for (usz k=0, j=0, ij=WV; ; ) { \
- usz e = b<s-k? k+b : s; \
- for (usz i=k; i<e; i++) rp[i]=0; \
- SET; \
- while (ij<e) { j++; UPD; ij+=WV; } \
- SCAN; \
- if (e==s) {break;} k=e; \
- }
-#define SUM_CORE(T, WV, PREP, INC) \
- SCAN_CORE(WV, PREP; rp[ij]+=INC, , PLUS_SCAN(T))
-
-#if SINGELI_AVX2
- #define IND_BY_SCAN \
- SCAN_CORE(xp[j], rp[ij]=j, rp[k]=j, si_scan_max_i32(rp+k,rp+k,e-k))
-#else
- #define IND_BY_SCAN usz js=0; SUM_CORE(i32, xp[j], , 1)
-#endif
-
-#define REP_BY_SCAN(T, WV) \
- T* xp = xv; T* rp = rv; \
- T js=xp[0], px=js; \
- SUM_CORE(T, WV, T sx=px, (px=xp[j])-sx)
-
#define BOOL_REP_XOR_SCAN(WV) \
usz b = 1<<12; \
u64 xx=xp[0], xs=xx>>63, js=-(xx&1); xx^=xx<<1; \
@@ -622,14 +583,13 @@ B slash_c1(B t, B x) {
for (u64 j = 0; j < c; j++) *rp++ = i;
}
} else {
+ #if SINGELI
if (s/32 <= xia) { // Sparse case: type of x matters
- #define SPARSE_IND(T) T* xp = T##any_ptr(x); IND_BY_SCAN
i32* rp; r = m_i32arrv(&rp, s);
- if (xe == el_i8 ) { SPARSE_IND(i8 ); }
- else if (xe == el_i16) { SPARSE_IND(i16); }
- else { SPARSE_IND(i32); }
- #undef SPARSE_IND
- } else { // Dense case: only result type matters
+ si_indices_scan_i32[elwByteLog(xe)](tyany_ptr(x), rp, s);
+ } else
+ #endif
+ { // Dense case: only result type matters
#define DENSE_IND(T) \
T* rp; r = m_##T##arrv(&rp, s); \
for (u64 i = 0; i < xia; i++) { \
@@ -748,17 +708,12 @@ B slash_c2(B t, B w, B x) {
void* rv = m_tyarrlv(&r, xk, s, xt);
if (rsh) { Arr* ra=a(r); SPRNK(ra,xr); ra->sh = rsh; ra->ia = s*arr_csz(x); }
void* xv = tyany_ptr(x);
+ #if SINGELI
if ((xk<3? s/64 : s/32) <= wia) { // Sparse case: use both types
- #define CASE(L,XT) case L: { REP_BY_SCAN(XT, wp[j]) break; }
- #define SPARSE_REP(WT) \
- WT* wp = WT##any_ptr(w); \
- switch (xk) { default: UD; CASE(0,u8) CASE(1,u16) CASE(2,u32) CASE(3,u64) }
- if (we == el_i8 ) { SPARSE_REP(i8 ); }
- else if (we == el_i16) { SPARSE_REP(i16); }
- else { SPARSE_REP(i32); }
- #undef SPARSE_REP
- #undef CASE
- } else { // Dense case: only type of x matters
+ si_replicate_scan[4*elwByteLog(we) + xk](tyany_ptr(w), xv, rv, s);
+ } else
+ #endif
+ { // Dense case: only type of x matters
#define CASE(L,T) case L: { \
T* xp = xv; T* rp = rv; \
for (usz i = 0; i < wia; i++) { \
@@ -845,13 +800,18 @@ B slash_c2(B t, B w, B x) {
u8 xk = xl-3;
void* rv = m_tyarrv(&r, 1<<xk, s, xt);
void* xv = tyany_ptr(x);
- #if SINGELI_AVX2 || SINGELI_NEON
- simd_constrep[xk](wv, xv, rv, xlen);
+ #if SINGELI
+ si_constrep[xk](wv, xv, rv, xlen);
#else
- #define CASE(L,T) case L: { REP_BY_SCAN(T, wv) break; }
+ #define CASE(L,T) case L: { \
+ T* xp = xv; T* rp = rv; \
+ for (usz i = 0; i < xlen; i++) { \
+ for (i64 j = 0; j < wv; j++) *rp++ = xp[i]; \
+ } \
+ } break;
switch (xk) { default: UD; CASE(0,u8) CASE(1,u16) CASE(2,u32) CASE(3,u64) }
- #endif
#undef CASE
+ #endif
}
atmW_maybesh:;
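[Note] The deleted REP_BY_SCAN/SUM_CORE macros implemented sparse replicate by writing value differences at run boundaries and then taking a wrapping plus-scan; that logic now lives in replicate.singeli (further down) so the scan can be dispatched through si_replicate_scan. A minimal standalone sketch of the same idea, assuming s equals the sum of the counts:

    #include <stdint.h>
    #include <stddef.h>
    // w/x by delta-and-scan: zero the result, add x[j]-x[j-1] (wrapping)
    // at the start of each run, then a plus-scan seeded with x[0] turns
    // the deltas back into values. The scan is the only O(s) pass and is
    // what the si_scan_pluswrap_* kernels accelerate.
    static void replicate_by_scan_u32(const uint32_t* w, const uint32_t* x,
                                      uint32_t* r, size_t n, size_t s) {
      for (size_t i = 0; i < s; i++) r[i] = 0;
      size_t ij = w[0];                        // start of run 1
      for (size_t j = 1; j < n; j++) {
        if (ij >= s) break;
        r[ij] += x[j] - x[j-1];                // += so zero-length runs stack
        ij += w[j];
      }
      uint32_t acc = x[0];
      for (size_t i = 0; i < s; i++) { acc += r[i]; r[i] = acc; }
    }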
diff --git a/src/builtins/transpose.c b/src/builtins/transpose.c
index 331b0127..63dba290 100644
--- a/src/builtins/transpose.c
+++ b/src/builtins/transpose.c
@@ -2,7 +2,7 @@
// Transpose
// One length-2 axis: dedicated code
-// Boolean: pdep for height 2; pext for width 2
+// Boolean: pdep or emulation for height 2; pext for width 2
// SHOULD use a generic implementation if BMI2 not present
// SHOULD optimize other short lengths with pdep/pext and shuffles
// Boolean 𝕩: convert to integer
@@ -40,6 +40,9 @@
#include "../utils/calls.h"
#ifdef __BMI2__
+ #if !SLOW_PDEP
+ #define FAST_PDEP 1
+ #endif
#include <immintrin.h>
#if USE_VALGRIND
#define _pdep_u64 vg_pdep_u64
@@ -66,6 +69,67 @@ typedef void (*TranspFn)(void*,void*,u64,u64,u64,u64);
#endif
+static void interleave_bits(u64* rp, void* x0v, void* x1v, usz n) {
+ u32* x0 = (u32*)x0v; u32* x1 = (u32*)x1v;
+ for (usz i=0; i<BIT_N(n); i++) {
+ #if FAST_PDEP
+ rp[i] = _pdep_u64(x0[i], 0x5555555555555555) | _pdep_u64(x1[i], 0xAAAAAAAAAAAAAAAA);
+ #else
+ #define STEP(V,M,SH) V = (V | V<<SH) & M;
+ #define EXPAND(V) \
+ STEP(V, 0x0000ffff0000ffff, 16) \
+ STEP(V, 0x00ff00ff00ff00ff, 8) \
+ STEP(V, 0x0f0f0f0f0f0f0f0f, 4) \
+ STEP(V, 0x3333333333333333, 2) \
+ STEP(V, 0x5555555555555555, 1)
+ u64 e0 = x0[i]; EXPAND(e0);
+ u64 e1 = x1[i]; EXPAND(e1);
+ rp[i] = e0 | e1<<1;
+ #undef EXPAND
+ #undef STEP
+ #endif
+ }
+}
+
+// Interleave arrays, 𝕨≍⎉(-xk)𝕩. Doesn't consume.
+// Return bi_N if there isn't fast code.
+B try_interleave_cells(B w, B x, ur xr, ur xk, usz* xsh) {
+ assert(RNK(w)==xr && xr>=1);
+ u8 xe = TI(x,elType); if (xe!=TI(w,elType)) return bi_N;
+ usz csz = shProd(xsh, xk, xr);
+ if (csz & (csz-1)) return bi_N; // Not power of 2
+ u8 xlw = elwBitLog(xe);
+ usz n = shProd(xsh, 0, xk);
+ usz ia = 2*n*csz;
+ Arr *r;
+ if (csz==1 && xlw==0) {
+ u64* rp; r=m_bitarrp(&rp, ia);
+ interleave_bits(rp, bitarr_ptr(w), bitarr_ptr(x), ia);
+ } else
+ #if SINGELI
+ if (csz==1 && xe==el_B) {
+ B* wp = TO_BPTR(w); B* xp = TO_BPTR(x);
+ HArr_p p = m_harrUv(ia); // Debug build complains with harrUp
+ si_interleave[3](p.a, wp, xp, n);
+ for (usz i=0; i<ia; i++) inc(p.a[i]);
+ NOGC_E;
+ B rb = p.b;
+ if (SFNS_FILLS) rb = qWithFill(rb, fill_both(w, x));
+ r = a(rb);
+ } else if (csz<=64>>xlw && csz<<xlw>=8) { // Require CPU-sized cells
+ assert(xe!=el_B);
+ void* rv;
+ if (xlw==0) { u64* rp; r = m_bitarrp(&rp, ia); rv=rp; }
+ else rv = m_tyarrp(&r,elWidth(xe),ia,el2t(xe));
+ si_interleave[CTZ(csz<<xlw)-3](rv, tyany_ptr(w), tyany_ptr(x), n);
+ } else
+ #endif
+ return bi_N;
+ usz* sh = arr_shAlloc(r, xr+1);
+ shcpy(sh, xsh, xk); sh[xk]=2; shcpy(sh+xk+1, xsh+xk, xr-xk);
+ return taga(r);
+}
+
static void transpose_move(void* rv, void* xv, u8 xe, usz w, usz h) {
assert(xe!=el_bit); assert(xe!=el_B);
transposeFns[elwByteLog(xe)](rv, xv, w, h, w, h);
@@ -87,14 +151,12 @@ static Arr* transpose_noshape(B* px, usz ia, usz w, usz h) {
r=a(qWithFill(p.b, xf));
} else if (xe==el_bit) {
- #ifdef __BMI2__
if (h==2) {
- u32* x0 = (u32*)bitarr_ptr(x);
u64* rp; r=m_bitarrp(&rp, ia);
Arr* x1o = TI(x,slice)(inc(x),w,w);
- u32* x1 = (u32*) ((TyArr*)x1o)->a;
- for (usz i=0; i<BIT_N(ia); i++) rp[i] = _pdep_u64(x0[i], 0x5555555555555555) | _pdep_u64(x1[i], 0xAAAAAAAAAAAAAAAA);
+ interleave_bits(rp, bitarr_ptr(x), ((TyArr*)x1o)->a, ia);
mm_free((Value*)x1o);
+ #ifdef __BMI2__
} else if (w==2) {
u64* xp = bitarr_ptr(x);
u64* r0; r=m_bitarrp(&r0, ia);
@@ -106,9 +168,8 @@ static Arr* transpose_noshape(B* px, usz ia, usz w, usz h) {
}
bit_cpyN(r0, h, r1, 0, h);
TFREE(r1);
- } else
#endif
- {
+ } else {
*px = x = taga(cpyI8Arr(x)); xe=el_i8;
void* rv = m_tyarrp(&r,elWidth(xe),ia,el2t(xe));
void* xv = tyany_ptr(x);
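[Note] interleave_bits above is the shared kernel for the height-2 boolean transpose and the boolean case of try_interleave_cells. Its non-BMI2 branch uses the classic bit-spreading sequence; as a standalone illustration (plain C, not the CBQN macros), spreading one 32-bit word into the even bits of a 64-bit word looks like this:

    #include <stdint.h>
    // Double the gap between bits at each step: 16, 8, 4, 2, 1. After the
    // last step every source bit sits in an even position, so interleaving
    // two words is two spreads and a shifted OR -- the same result a pair
    // of _pdep_u64 calls with masks 0x5555... / 0xAAAA... produces.
    static uint64_t spread_bits(uint32_t v) {
      uint64_t x = v;
      x = (x | x << 16) & 0x0000ffff0000ffffull;
      x = (x | x <<  8) & 0x00ff00ff00ff00ffull;
      x = (x | x <<  4) & 0x0f0f0f0f0f0f0f0full;
      x = (x | x <<  2) & 0x3333333333333333ull;
      x = (x | x <<  1) & 0x5555555555555555ull;
      return x;
    }
    static uint64_t interleave_words(uint32_t lo, uint32_t hi) {
      return spread_bits(lo) | spread_bits(hi) << 1;
    }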
diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli
index 68ded7f3..d5e11120 100644
--- a/src/singeli/src/base.singeli
+++ b/src/singeli/src/base.singeli
@@ -5,6 +5,7 @@ include 'util/kind'
def ux = u64
config usz = u32
+config SLOW_PDEP = 0
def same = is
oper ~~ reinterpret infix right 55
@@ -157,6 +158,7 @@ def base{b,l} = if (0==tuplen{l}) 0 else tupsel{0,l}+b*base{b,slice{l,1}}
# vector definitions
def arch_defvw = if (hasarch{'AVX2'}) 256 else 128
def has_simd = hasarch{'X86_64'} | hasarch{'AARCH64'}
+def fast_BMI2{} = if (SLOW_PDEP) 0 else hasarch{'BMI2'}
# test if vector has a specific width & element type
def lvec{T, n, w} = 0
diff --git a/src/singeli/src/constrep.singeli b/src/singeli/src/replicate.singeli
index 28e6b0a0..82c981ea 100644
--- a/src/singeli/src/constrep.singeli
+++ b/src/singeli/src/replicate.singeli
@@ -1,8 +1,72 @@
include './base'
-if (hasarch{'AVX2'} | hasarch{'AARCH64'}) {
-
include './mask'
include 'util/tup'
+
+def ind_types = tup{i8, i16, i32}
+def dat_types = tup{...ind_types, u64}
+
+# Indices and Replicate using plus- or max-scan
+def scan_core{upd, set, scan, rp:pT, wp:W, s:(usz)} = {
+ def getw{j} = if (isptr{W}) cast_i{usz,load{wp,j}} else wp
+ b:usz = 1<<10
+ k:usz = 0; j:usz = 0; ij:=getw{j}
+ while (1) {
+ e := tern{b<s-k, k+b, s}
+ @for (rp over i from k to e) rp = 0
+ if (set) store{rp, k, cast_i{eltype{pT},j}}
+ while (ij<e) { ++j; upd{rp, j, ij}; ij+=getw{j} }
+ scan{rp+k, e-k}
+ if (e==s) return{}
+ k = e
+ }
+}
+def indrep_by_sum{T, rp:*T, wp, s:(usz), js, inc} = {
+ def scan{ptr, len} = @for (ptr over len) js=ptr+=js
+ def scan{ptr, len & width{T}<=32} = {
+ def scanfn = merge{'si_scan_pluswrap_u',fmtnat{width{T}}}
+ p := *ty_u{eltype{type{ptr}}}~~ptr
+ emit{void, scanfn, p, p, len, js}; js=load{ptr,len-1}
+ }
+ def upd{rp, j, ij} = store{rp, ij, load{rp,ij}+inc{j}}
+ scan_core{upd, 0, scan, rp, wp, s}
+}
+
+fn ind_by_scan_i32{W}(xv:*void, rp:*i32, s:usz) : void = {
+ xp := *W~~xv
+ if (hasarch{'X86_64'} & ~hasarch{'SSE4.1'}) { # no min instruction
+ js:i32 = 0
+ indrep_by_sum{i32, rp, xp, s, js, {j}=>1}
+ } else {
+ scan_core{
+ {rp,j,ij} => store{rp,ij,cast_i{i32,j}}, 1,
+ {ptr,len} => emit{void, 'si_scan_max_i32', ptr,ptr,len},
+ rp, xp, s
+ }
+ }
+}
+
+def rep_by_scan{T, wp, xv:*void, rv:*void, s} = {
+ xp := *T~~xv; js := *xp; px := js
+ def inc{j} = {sx:=px; px=load{xp,j}; px-sx}
+ indrep_by_sum{T, *T~~rv, wp, s, js, inc}
+}
+fn rep_by_scan{W, T}(wp:*void, xv:*void, rv:*void, s:usz) : void = {
+ rep_by_scan{T, *W~~wp, xv, rv, s}
+}
+
+exportT{'si_indices_scan_i32', each{ind_by_scan_i32, ind_types}}
+exportT{'si_replicate_scan', flat_table{rep_by_scan, ind_types, dat_types}}
+
+
+# Constant replicate
+if (not (hasarch{'AVX2'} | hasarch{'AARCH64'})) {
+
+fn rep_const{T}(wv:u64, x:*void, r:*void, n:u64) : void = {
+ rep_by_scan{T, cast_i{usz,wv}, x, r, cast_i{usz, wv*n}}
+}
+
+} else {
+
def incl{a,b} = slice{iota{b+1},a}
# 1+˝∨`⌾⌽0=div|⌜range
@@ -220,6 +284,6 @@ fn rep_const{T}(wv:u64, x:*void, r:*void, n:u64) : void = {
}
}
-exportT{'simd_constrep', each{rep_const, tup{i8, i16, i32, u64}}}
+}
-}
\ No newline at end of file
+exportT{'si_constrep', each{rep_const, dat_types}}
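[Note] ind_by_scan_i32 is the sparse path for monadic / (Where): when the result is much longer than the argument, it marks each run start with its index and lets a scan fill the gaps instead of looping over every count. A scalar sketch of the max-scan variant (the pre-SSE4.1 fallback uses a plus-scan of ones instead):

    #include <stdint.h>
    #include <stddef.h>
    // x holds n counts whose sum is s; r receives index j repeated x[j]
    // times. Writing j at each run start and max-scanning works because
    // the produced indices are nondecreasing.
    static void indices_by_scan_i32(const int32_t* x, int32_t* r,
                                    size_t n, size_t s) {
      for (size_t i = 0; i < s; i++) r[i] = 0;
      size_t ij = (size_t)x[0];
      for (size_t j = 1; j < n; j++) {
        if (ij < s) r[ij] = (int32_t)j;   // a later j overwrites zero-count runs
        ij += (size_t)x[j];
      }
      int32_t m = 0;                      // max-scan carries the last mark forward
      for (size_t i = 0; i < s; i++) { if (r[i] > m) m = r[i]; r[i] = m; }
    }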
diff --git a/src/singeli/src/slash.singeli b/src/singeli/src/slash.singeli
index e1560dd4..e66b97fa 100644
--- a/src/singeli/src/slash.singeli
+++ b/src/singeli/src/slash.singeli
@@ -339,8 +339,8 @@ def pext_popc{x:T, m:T} = {
tup{pe, scal{w} - z}
}
-def pext_width {..._ & hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2
-def thresh_bool{..._ & hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32
+def pext_width {& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2
+def thresh_bool{& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32
def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = {
def clmul{a, b} = zipLo{...@collect (j to 2) clmul{a,b,j}}
m := m0
@@ -359,9 +359,9 @@ def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = {
tup{x, @collect (j to 2) popc{extract{m0,j}}}
}
-def pext_width {..._ & hasarch{'BMI2'}} = 1
-def thresh_bool{..._ & hasarch{'BMI2'}} = 512
-def pext_popc{x:T, m:T & hasarch{'BMI2'} & T==u64} = tup{pext{x, m}, popc{m}}
+def pext_width {& fast_BMI2{}} = 1
+def thresh_bool{& fast_BMI2{}} = 512
+def pext_popc{x:T, m:T & fast_BMI2{} & T==u64} = tup{pext{x, m}, popc{m}}
fn compress_bool(w:*u64, x:*u64, r:*u64, n:u64) : void = {
cw:u64 = 0; # current word
diff --git a/src/singeli/src/transpose.singeli b/src/singeli/src/transpose.singeli
index 08dbc977..4fe11838 100644
--- a/src/singeli/src/transpose.singeli
+++ b/src/singeli/src/transpose.singeli
@@ -150,6 +150,14 @@ def transpose_with_kernel{T, k, kh, call_base, rp:*T, xp:*T, w, h, ws, hs} = {
}
}
+# Interleave n values of type T from x0 and x1 into r
+fn interleave{T}(r0:*void, x0:*void, x1:*void, n:u64) : void = {
+ rp := *T~~r0
+ @for (x0 in *T~~x0, x1 in *T~~x1 over i to n) {
+ store{rp, i*2, x0}; store{rp, i*2+1, x1}
+ }
+}
+
fn transpose{T, k, kh}(r0:*void, x0:*void, w:u64, h:u64, ws:u64, hs:u64) : void = {
# Scalar transpose defined in C
def ts = if (T==i8) 'i8' else if (T==i16) 'i16' else if (T==i32) 'i32' else 'i64'
@@ -160,7 +168,7 @@ fn transpose{T, k, kh}(r0:*void, x0:*void, w:u64, h:u64, ws:u64, hs:u64) : void
if (hasarch{'AVX2'} and w>=k and h>=k) {
transpose_with_kernel{T, k, kh, call_base, rp, xp, w, h, ws, hs}
} else {
- if (h==2 and h==hs) @for (x0 in xp, x1 in xp+ws over i to w) { store{rp, i*2, x0}; store{rp, i*2+1, x1} }
+ if (h==2 and h==hs) interleave{T}(r0, x0, *void~~(xp+ws), w)
else if (w==2 and w==ws) @for (r0 in rp, r1 in rp+hs over i to h) { r0 = load{xp, i*2}; r1 = load{xp, i*2+1} }
else call_base{rp, xp, w, h}
}
@@ -174,3 +182,5 @@ exportT{'simd_transpose', tup{
transpose{i32, 8},
transpose{i64, 4}
}}
+
+exportT{'si_interleave', each{interleave, tup{i8, i16, i32, i64}}}
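[Note] The exported si_interleave table is the element-level interleave that both the transpose h==2 path and try_interleave_cells call into; it is indexed by log2 of the element byte width, which is why a power-of-two cell can be interleaved as a single machine word of that width. Its scalar shape, sketched in C rather than the Singeli it is generated from:

    #include <stdint.h>
    #include <stddef.h>
    // One kernel per element width; the i32 instance shown. The Singeli
    // @for loop compiles to the same pattern, possibly vectorized.
    static void interleave_i32(int32_t* r, const int32_t* x0,
                               const int32_t* x1, size_t n) {
      for (size_t i = 0; i < n; i++) { r[2*i] = x0[i]; r[2*i + 1] = x1[i]; }
    }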