author     dzaima <dzaimagit@gmail.com>        2024-01-01 16:40:19 +0200
committer  GitHub <noreply@github.com>         2024-01-01 16:40:19 +0200
commit     a82d3e1c68143b750906bfd2a6396d83b9b9b97e (patch)
tree       c8cc03c527125266d4161ea1037828c7c99b7b09
parent     9931c1756c8f775f9a1e0abf813351a3a52d88f8 (diff)
parent     dbb6fbade19c1bc367fd6e4faf9dd5d4aa893a4b (diff)
Merge pull request #102 from mlochbaum/misc
Miscellaneous
-rwxr-xr-x  build/src/build.bqn                 16
-rw-r--r--  src/builtins/cells.c                 5
-rw-r--r--  src/builtins/slash.c                88
-rw-r--r--  src/builtins/transpose.c            75
-rw-r--r--  src/singeli/src/base.singeli         2
-rw-r--r--  src/singeli/src/replicate.singeli (renamed from src/singeli/src/constrep.singeli)  72
-rw-r--r--  src/singeli/src/slash.singeli       10
-rw-r--r--  src/singeli/src/transpose.singeli   12
8 files changed, 192 insertions, 88 deletions
diff --git a/build/src/build.bqn b/build/src/build.bqn
index e8c4981d..615e8526 100755
--- a/build/src/build.bqn
+++ b/build/src/build.bqn
@@ -659,10 +659,10 @@ cachedBin‿linkerCache ← {
"xa."‿"src/utils/bits.c"‿"bits", "xag"‿"src/builtins/transpose.c"‿"transpose",
"xag"‿"src/builtins/search.c"‿"search", "xag"‿"src/builtins/selfsearch.c"‿"selfsearch"
"xag"‿"src/builtins/scan.c"‿"scan", "xa."‿"src/builtins/fold.c"‿"fold",
- "xag"‿"src/builtins/sort.c"‿"bins"
+ "xag"‿"src/builtins/slash.c"‿"slash", "xag"‿"src/builtins/slash.c"‿"replicate",
+ "xag"‿"src/builtins/sort.c"‿"bins", "xa."‿"src/builtins/slash.c"‿"count"
- "x.."‿"src/builtins/select.c"‿"select", "xa."‿"src/builtins/slash.c"‿"constrep",
- "xag"‿"src/builtins/slash.c"‿"slash", "xa."‿"src/builtins/slash.c"‿"count"
+ "x.."‿"src/builtins/select.c"‿"select"
objs ← ⟨⟩
@@ -694,10 +694,12 @@ cachedBin‿linkerCache ← {
•file.Name ga, ⟨⟩
- singeliArgs ← po.singeliFlags∾⟨"-l", "gen="∾AtRoot singeliCache.folder, "-c", "usz=u"∾•Repr po.usz⟩∾{
- po.native? ⟨⟩;
- "-a" ⋈ 1↓∾ ','⊸∾¨ po.singeliArch
- }
+ singeliArgs ← ∾⟨
+ po.singeliFlags
+ ⟨"-l", "gen="∾AtRoot singeliCache.folder, "-c", "usz=u"∾•Repr po.usz⟩
+ (⊑"slow-pdep"<⊸∊po.has)/⟨"-c", "SLOW_PDEP=1"⟩
+ {po.native? ⟨⟩; "-a" ⋈ 1↓∾ ','⊸∾¨ po.singeliArch}
+ ⟩
{𝕊: "Singeli args: "∾•Repr singeliArgs} _verboseLog @
{𝕊: "Singeli-required C args: "∾•Repr po.siCFlags; @} _verboseLog @
singeliObjs ↩ {MakeSingeliInv ⟨singeliArgs, {𝕊:UpdateSubmodule po.singeliDir}, singeliCache, 𝕩, "src/singeli/src/"•file.At 𝕩∾".singeli", (𝕩≡"dyarith")/⟨gaRule⟩⟩}¨ 1⊑¨singeliMap
diff --git a/src/builtins/cells.c b/src/builtins/cells.c
index 1ff71961..354fb8f5 100644
--- a/src/builtins/cells.c
+++ b/src/builtins/cells.c
@@ -9,6 +9,7 @@ B shape_c2(B, B, B);
B transp_c2(B, B, B);
B fold_rows(Md1D* d, B x); // from fold.c
B takedrop_highrank(bool take, B w, B x); // from sfns.c
+B try_interleave_cells(B w, B x, ur xr, ur xk, usz* xsh); // from transpose.c
// X - variable name; XSH - its shape; K - number of leading axes that get iterated over; SLN - number of slices that will be made; DX - additional refcount count to add to x
#define S_KSLICES(X, XSH, K, SLN, DX)\
@@ -609,6 +610,10 @@ NOINLINE B for_cells_AA(B f, B w, B x, ur wcr, ur xcr, u32 chr) {
if (rsh) shcpy(rsh, zsh, zk);
decG(w); decG(x); return taga(r);
}
+ if (rtid==n_couple && wr==xr) {
+ B r = try_interleave_cells(w, x, xr, xk, xsh);
+ if (!q_N(r)) { decG(w); decG(x); return r; }
+ }
}
if (isPervasiveDy(f)) {
if (TI(w,elType)==el_B || TI(x,elType)==el_B) goto generic;
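[Note] The cells.c change routes w ≍˘ x (couple over same-rank cells) to a shared fast path in transpose.c. For reference, a plain scalar version of that operation could look like the sketch below; interleave_cells_u8 is a hypothetical helper for illustration, not CBQN code, and the real path handles each cell as a machine word or via si_interleave.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>
    // Coupling the cells of two equal-shape arrays writes cell i of w,
    // then cell i of x, for each of the ncells leading cells.
    // csz is the number of elements per cell (u8 elements assumed here).
    static void interleave_cells_u8(const uint8_t* w, const uint8_t* x,
                                    uint8_t* r, size_t ncells, size_t csz) {
      for (size_t i = 0; i < ncells; i++) {
        memcpy(r, w + i*csz, csz); r += csz;   // cell i of w
        memcpy(r, x + i*csz, csz); r += csz;   // cell i of x
      }
    }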
diff --git a/src/builtins/slash.c b/src/builtins/slash.c
index d1dcc5d0..90b9c924 100644
--- a/src/builtins/slash.c
+++ b/src/builtins/slash.c
@@ -86,12 +86,13 @@
#endif
#if SINGELI
+ extern void (*const si_scan_pluswrap_u8)(uint8_t* v0,uint8_t* v1,uint64_t v2,uint8_t v3);
+ extern void (*const si_scan_pluswrap_u16)(uint16_t* v0,uint16_t* v1,uint64_t v2,uint16_t v3);
+ extern void (*const si_scan_pluswrap_u32)(uint32_t* v0,uint32_t* v1,uint64_t v2,uint32_t v3);
+ extern void (*const si_scan_max_i32)(int32_t* v0,int32_t* v1,uint64_t v2);
#define SINGELI_FILE slash
#include "../utils/includeSingeli.h"
-#endif
-
-#if SINGELI_AVX2 || SINGELI_NEON
- #define SINGELI_FILE constrep
+ #define SINGELI_FILE replicate
#include "../utils/includeSingeli.h"
#endif
@@ -100,20 +101,6 @@
#include "../utils/includeSingeli.h"
#endif
-#if SINGELI
- extern void (*const si_scan_pluswrap_u8)(uint8_t* v0,uint8_t* v1,uint64_t v2,uint8_t v3);
- extern void (*const si_scan_pluswrap_u16)(uint16_t* v0,uint16_t* v1,uint64_t v2,uint16_t v3);
- extern void (*const si_scan_pluswrap_u32)(uint32_t* v0,uint32_t* v1,uint64_t v2,uint32_t v3);
- #define ALIAS(I,U) static void si_scan_pluswrap_##I(I* a, I* b, u64 c, I d) { si_scan_pluswrap_##U((U*)a, (U*)b, c, d); }
- ALIAS(i8,u8) ALIAS(i16,u16) ALIAS(i32,u32)
- #undef ALIAS
- #define si_scan_pluswrap_u64(V0,V1,V2,V3) for (usz i=k; i<e; i++) js=rp[i]+=js;
- #define PLUS_SCAN(T) si_scan_pluswrap_##T(rp+k,rp+k,e-k,js); js=rp[e-1];
- extern void (*const si_scan_max_i32)(int32_t* v0,int32_t* v1,uint64_t v2);
-#else
- #define PLUS_SCAN(T) for (usz i=k; i<e; i++) js=rp[i]+=js;
-#endif
-
// Dense Where, still significantly worse than SIMD
// Assumes modifiable DST
#define WHERE_DENSE(SRC, DST, LEN, OFF) do { \
@@ -537,32 +524,6 @@ static B compress(B w, B x, usz wia, u8 xl, u8 xt) {
return r;
}
-// Replicate using plus/max/xor-scan
-#define SCAN_CORE(WV, UPD, SET, SCAN) \
- usz b = 1<<10; \
- for (usz k=0, j=0, ij=WV; ; ) { \
- usz e = b<s-k? k+b : s; \
- for (usz i=k; i<e; i++) rp[i]=0; \
- SET; \
- while (ij<e) { j++; UPD; ij+=WV; } \
- SCAN; \
- if (e==s) {break;} k=e; \
- }
-#define SUM_CORE(T, WV, PREP, INC) \
- SCAN_CORE(WV, PREP; rp[ij]+=INC, , PLUS_SCAN(T))
-
-#if SINGELI_AVX2
- #define IND_BY_SCAN \
- SCAN_CORE(xp[j], rp[ij]=j, rp[k]=j, si_scan_max_i32(rp+k,rp+k,e-k))
-#else
- #define IND_BY_SCAN usz js=0; SUM_CORE(i32, xp[j], , 1)
-#endif
-
-#define REP_BY_SCAN(T, WV) \
- T* xp = xv; T* rp = rv; \
- T js=xp[0], px=js; \
- SUM_CORE(T, WV, T sx=px, (px=xp[j])-sx)
-
#define BOOL_REP_XOR_SCAN(WV) \
usz b = 1<<12; \
u64 xx=xp[0], xs=xx>>63, js=-(xx&1); xx^=xx<<1; \
@@ -622,14 +583,13 @@ B slash_c1(B t, B x) {
for (u64 j = 0; j < c; j++) *rp++ = i;
}
} else {
+ #if SINGELI
if (s/32 <= xia) { // Sparse case: type of x matters
- #define SPARSE_IND(T) T* xp = T##any_ptr(x); IND_BY_SCAN
i32* rp; r = m_i32arrv(&rp, s);
- if (xe == el_i8 ) { SPARSE_IND(i8 ); }
- else if (xe == el_i16) { SPARSE_IND(i16); }
- else { SPARSE_IND(i32); }
- #undef SPARSE_IND
- } else { // Dense case: only result type matters
+ si_indices_scan_i32[elwByteLog(xe)](tyany_ptr(x), rp, s);
+ } else
+ #endif
+ { // Dense case: only result type matters
#define DENSE_IND(T) \
T* rp; r = m_##T##arrv(&rp, s); \
for (u64 i = 0; i < xia; i++) { \
@@ -748,17 +708,12 @@ B slash_c2(B t, B w, B x) {
void* rv = m_tyarrlv(&r, xk, s, xt);
if (rsh) { Arr* ra=a(r); SPRNK(ra,xr); ra->sh = rsh; ra->ia = s*arr_csz(x); }
void* xv = tyany_ptr(x);
+ #if SINGELI
if ((xk<3? s/64 : s/32) <= wia) { // Sparse case: use both types
- #define CASE(L,XT) case L: { REP_BY_SCAN(XT, wp[j]) break; }
- #define SPARSE_REP(WT) \
- WT* wp = WT##any_ptr(w); \
- switch (xk) { default: UD; CASE(0,u8) CASE(1,u16) CASE(2,u32) CASE(3,u64) }
- if (we == el_i8 ) { SPARSE_REP(i8 ); }
- else if (we == el_i16) { SPARSE_REP(i16); }
- else { SPARSE_REP(i32); }
- #undef SPARSE_REP
- #undef CASE
- } else { // Dense case: only type of x matters
+ si_replicate_scan[4*elwByteLog(we) + xk](tyany_ptr(w), xv, rv, s);
+ } else
+ #endif
+ { // Dense case: only type of x matters
#define CASE(L,T) case L: { \
T* xp = xv; T* rp = rv; \
for (usz i = 0; i < wia; i++) { \
@@ -845,13 +800,18 @@ B slash_c2(B t, B w, B x) {
u8 xk = xl-3;
void* rv = m_tyarrv(&r, 1<<xk, s, xt);
void* xv = tyany_ptr(x);
- #if SINGELI_AVX2 || SINGELI_NEON
- simd_constrep[xk](wv, xv, rv, xlen);
+ #if SINGELI
+ si_constrep[xk](wv, xv, rv, xlen);
#else
- #define CASE(L,T) case L: { REP_BY_SCAN(T, wv) break; }
+ #define CASE(L,T) case L: { \
+ T* xp = xv; T* rp = rv; \
+ for (usz i = 0; i < xlen; i++) { \
+ for (i64 j = 0; j < wv; j++) *rp++ = xp[i]; \
+ } \
+ } break;
switch (xk) { default: UD; CASE(0,u8) CASE(1,u16) CASE(2,u32) CASE(3,u64) }
- #endif
#undef CASE
+ #endif
}
atmW_maybesh:;
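[Note] The deleted REP_BY_SCAN/SUM_CORE macros implemented sparse replicate by writing value differences at run boundaries and then taking a wrapping plus-scan; that logic now lives in replicate.singeli (further down) so the scan can be dispatched through si_replicate_scan. A minimal standalone sketch of the same idea, assuming s equals the sum of the counts:

    #include <stdint.h>
    #include <stddef.h>
    // w/x by delta-and-scan: zero the result, add x[j]-x[j-1] (wrapping)
    // at the start of each run, then a plus-scan seeded with x[0] turns
    // the deltas back into values. The scan is the only O(s) pass and is
    // what the si_scan_pluswrap_* kernels accelerate.
    static void replicate_by_scan_u32(const uint32_t* w, const uint32_t* x,
                                      uint32_t* r, size_t n, size_t s) {
      for (size_t i = 0; i < s; i++) r[i] = 0;
      size_t ij = w[0];                        // start of run 1
      for (size_t j = 1; j < n; j++) {
        if (ij >= s) break;
        r[ij] += x[j] - x[j-1];                // += so zero-length runs stack
        ij += w[j];
      }
      uint32_t acc = x[0];
      for (size_t i = 0; i < s; i++) { acc += r[i]; r[i] = acc; }
    }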
diff --git a/src/builtins/transpose.c b/src/builtins/transpose.c
index 331b0127..63dba290 100644
--- a/src/builtins/transpose.c
+++ b/src/builtins/transpose.c
@@ -2,7 +2,7 @@
// Transpose
// One length-2 axis: dedicated code
-// Boolean: pdep for height 2; pext for width 2
+// Boolean: pdep or emulation for height 2; pext for width 2
// SHOULD use a generic implementation if BMI2 not present
// SHOULD optimize other short lengths with pdep/pext and shuffles
// Boolean 𝕩: convert to integer
@@ -40,6 +40,9 @@
#include "../utils/calls.h"
#ifdef __BMI2__
+ #if !SLOW_PDEP
+ #define FAST_PDEP 1
+ #endif
#include <immintrin.h>
#if USE_VALGRIND
#define _pdep_u64 vg_pdep_u64
@@ -66,6 +69,67 @@ typedef void (*TranspFn)(void*,void*,u64,u64,u64,u64);
#endif
+static void interleave_bits(u64* rp, void* x0v, void* x1v, usz n) {
+ u32* x0 = (u32*)x0v; u32* x1 = (u32*)x1v;
+ for (usz i=0; i<BIT_N(n); i++) {
+ #if FAST_PDEP
+ rp[i] = _pdep_u64(x0[i], 0x5555555555555555) | _pdep_u64(x1[i], 0xAAAAAAAAAAAAAAAA);
+ #else
+ #define STEP(V,M,SH) V = (V | V<<SH) & M;
+ #define EXPAND(V) \
+ STEP(V, 0x0000ffff0000ffff, 16) \
+ STEP(V, 0x00ff00ff00ff00ff, 8) \
+ STEP(V, 0x0f0f0f0f0f0f0f0f, 4) \
+ STEP(V, 0x3333333333333333, 2) \
+ STEP(V, 0x5555555555555555, 1)
+ u64 e0 = x0[i]; EXPAND(e0);
+ u64 e1 = x1[i]; EXPAND(e1);
+ rp[i] = e0 | e1<<1;
+ #undef EXPAND
+ #undef STEP
+ #endif
+ }
+}
+
+// Interleave arrays, 𝕨≍⎉(-xk)𝕩. Doesn't consume.
+// Return bi_N if there isn't fast code.
+B try_interleave_cells(B w, B x, ur xr, ur xk, usz* xsh) {
+ assert(RNK(w)==xr && xr>=1);
+ u8 xe = TI(x,elType); if (xe!=TI(w,elType)) return bi_N;
+ usz csz = shProd(xsh, xk, xr);
+ if (csz & (csz-1)) return bi_N; // Not power of 2
+ u8 xlw = elwBitLog(xe);
+ usz n = shProd(xsh, 0, xk);
+ usz ia = 2*n*csz;
+ Arr *r;
+ if (csz==1 && xlw==0) {
+ u64* rp; r=m_bitarrp(&rp, ia);
+ interleave_bits(rp, bitarr_ptr(w), bitarr_ptr(x), ia);
+ } else
+ #if SINGELI
+ if (csz==1 && xe==el_B) {
+ B* wp = TO_BPTR(w); B* xp = TO_BPTR(x);
+ HArr_p p = m_harrUv(ia); // Debug build complains with harrUp
+ si_interleave[3](p.a, wp, xp, n);
+ for (usz i=0; i<ia; i++) inc(p.a[i]);
+ NOGC_E;
+ B rb = p.b;
+ if (SFNS_FILLS) rb = qWithFill(rb, fill_both(w, x));
+ r = a(rb);
+ } else if (csz<=64>>xlw && csz<<xlw>=8) { // Require CPU-sized cells
+ assert(xe!=el_B);
+ void* rv;
+ if (xlw==0) { u64* rp; r = m_bitarrp(&rp, ia); rv=rp; }
+ else rv = m_tyarrp(&r,elWidth(xe),ia,el2t(xe));
+ si_interleave[CTZ(csz<<xlw)-3](rv, tyany_ptr(w), tyany_ptr(x), n);
+ } else
+ #endif
+ return bi_N;
+ usz* sh = arr_shAlloc(r, xr+1);
+ shcpy(sh, xsh, xk); sh[xk]=2; shcpy(sh+xk+1, xsh+xk, xr-xk);
+ return taga(r);
+}
+
static void transpose_move(void* rv, void* xv, u8 xe, usz w, usz h) {
assert(xe!=el_bit); assert(xe!=el_B);
transposeFns[elwByteLog(xe)](rv, xv, w, h, w, h);
@@ -87,14 +151,12 @@ static Arr* transpose_noshape(B* px, usz ia, usz w, usz h) {
r=a(qWithFill(p.b, xf));
} else if (xe==el_bit) {
- #ifdef __BMI2__
if (h==2) {
- u32* x0 = (u32*)bitarr_ptr(x);
u64* rp; r=m_bitarrp(&rp, ia);
Arr* x1o = TI(x,slice)(inc(x),w,w);
- u32* x1 = (u32*) ((TyArr*)x1o)->a;
- for (usz i=0; i<BIT_N(ia); i++) rp[i] = _pdep_u64(x0[i], 0x5555555555555555) | _pdep_u64(x1[i], 0xAAAAAAAAAAAAAAAA);
+ interleave_bits(rp, bitarr_ptr(x), ((TyArr*)x1o)->a, ia);
mm_free((Value*)x1o);
+ #ifdef __BMI2__
} else if (w==2) {
u64* xp = bitarr_ptr(x);
u64* r0; r=m_bitarrp(&r0, ia);
@@ -106,9 +168,8 @@ static Arr* transpose_noshape(B* px, usz ia, usz w, usz h) {
}
bit_cpyN(r0, h, r1, 0, h);
TFREE(r1);
- } else
#endif
- {
+ } else {
*px = x = taga(cpyI8Arr(x)); xe=el_i8;
void* rv = m_tyarrp(&r,elWidth(xe),ia,el2t(xe));
void* xv = tyany_ptr(x);
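[Note] interleave_bits above is the shared kernel for the height-2 boolean transpose and the boolean case of try_interleave_cells. Its non-BMI2 branch uses the classic bit-spreading sequence; as a standalone illustration (plain C, not the CBQN macros), spreading one 32-bit word into the even bits of a 64-bit word looks like this:

    #include <stdint.h>
    // Double the gap between bits at each step: 16, 8, 4, 2, 1. After the
    // last step every source bit sits in an even position, so interleaving
    // two words is two spreads and a shifted OR -- the same result a pair
    // of _pdep_u64 calls with masks 0x5555... / 0xAAAA... produces.
    static uint64_t spread_bits(uint32_t v) {
      uint64_t x = v;
      x = (x | x << 16) & 0x0000ffff0000ffffull;
      x = (x | x <<  8) & 0x00ff00ff00ff00ffull;
      x = (x | x <<  4) & 0x0f0f0f0f0f0f0f0full;
      x = (x | x <<  2) & 0x3333333333333333ull;
      x = (x | x <<  1) & 0x5555555555555555ull;
      return x;
    }
    static uint64_t interleave_words(uint32_t lo, uint32_t hi) {
      return spread_bits(lo) | spread_bits(hi) << 1;
    }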
diff --git a/src/singeli/src/base.singeli b/src/singeli/src/base.singeli
index 68ded7f3..d5e11120 100644
--- a/src/singeli/src/base.singeli
+++ b/src/singeli/src/base.singeli
@@ -5,6 +5,7 @@ include 'util/kind'
def ux = u64
config usz = u32
+config SLOW_PDEP = 0
def same = is
oper ~~ reinterpret infix right 55
@@ -157,6 +158,7 @@ def base{b,l} = if (0==tuplen{l}) 0 else tupsel{0,l}+b*base{b,slice{l,1}}
# vector definitions
def arch_defvw = if (hasarch{'AVX2'}) 256 else 128
def has_simd = hasarch{'X86_64'} | hasarch{'AARCH64'}
+def fast_BMI2{} = if (SLOW_PDEP) 0 else hasarch{'BMI2'}
# test if vector has a specific width & element type
def lvec{T, n, w} = 0
diff --git a/src/singeli/src/constrep.singeli b/src/singeli/src/replicate.singeli
index 28e6b0a0..82c981ea 100644
--- a/src/singeli/src/constrep.singeli
+++ b/src/singeli/src/replicate.singeli
@@ -1,8 +1,72 @@
include './base'
-if (hasarch{'AVX2'} | hasarch{'AARCH64'}) {
-
include './mask'
include 'util/tup'
+
+def ind_types = tup{i8, i16, i32}
+def dat_types = tup{...ind_types, u64}
+
+# Indices and Replicate using plus- or max-scan
+def scan_core{upd, set, scan, rp:pT, wp:W, s:(usz)} = {
+ def getw{j} = if (isptr{W}) cast_i{usz,load{wp,j}} else wp
+ b:usz = 1<<10
+ k:usz = 0; j:usz = 0; ij:=getw{j}
+ while (1) {
+ e := tern{b<s-k, k+b, s}
+ @for (rp over i from k to e) rp = 0
+ if (set) store{rp, k, cast_i{eltype{pT},j}}
+ while (ij<e) { ++j; upd{rp, j, ij}; ij+=getw{j} }
+ scan{rp+k, e-k}
+ if (e==s) return{}
+ k = e
+ }
+}
+def indrep_by_sum{T, rp:*T, wp, s:(usz), js, inc} = {
+ def scan{ptr, len} = @for (ptr over len) js=ptr+=js
+ def scan{ptr, len & width{T}<=32} = {
+ def scanfn = merge{'si_scan_pluswrap_u',fmtnat{width{T}}}
+ p := *ty_u{eltype{type{ptr}}}~~ptr
+ emit{void, scanfn, p, p, len, js}; js=load{ptr,len-1}
+ }
+ def upd{rp, j, ij} = store{rp, ij, load{rp,ij}+inc{j}}
+ scan_core{upd, 0, scan, rp, wp, s}
+}
+
+fn ind_by_scan_i32{W}(xv:*void, rp:*i32, s:usz) : void = {
+ xp := *W~~xv
+ if (hasarch{'X86_64'} & ~hasarch{'SSE4.1'}) { # no min instruction
+ js:i32 = 0
+ indrep_by_sum{i32, rp, xp, s, js, {j}=>1}
+ } else {
+ scan_core{
+ {rp,j,ij} => store{rp,ij,cast_i{i32,j}}, 1,
+ {ptr,len} => emit{void, 'si_scan_max_i32', ptr,ptr,len},
+ rp, xp, s
+ }
+ }
+}
+
+def rep_by_scan{T, wp, xv:*void, rv:*void, s} = {
+ xp := *T~~xv; js := *xp; px := js
+ def inc{j} = {sx:=px; px=load{xp,j}; px-sx}
+ indrep_by_sum{T, *T~~rv, wp, s, js, inc}
+}
+fn rep_by_scan{W, T}(wp:*void, xv:*void, rv:*void, s:usz) : void = {
+ rep_by_scan{T, *W~~wp, xv, rv, s}
+}
+
+exportT{'si_indices_scan_i32', each{ind_by_scan_i32, ind_types}}
+exportT{'si_replicate_scan', flat_table{rep_by_scan, ind_types, dat_types}}
+
+
+# Constant replicate
+if (not (hasarch{'AVX2'} | hasarch{'AARCH64'})) {
+
+fn rep_const{T}(wv:u64, x:*void, r:*void, n:u64) : void = {
+ rep_by_scan{T, cast_i{usz,wv}, x, r, cast_i{usz, wv*n}}
+}
+
+} else {
+
def incl{a,b} = slice{iota{b+1},a}
# 1+˝∨`⌾⌽0=div|⌜range
@@ -220,6 +284,6 @@ fn rep_const{T}(wv:u64, x:*void, r:*void, n:u64) : void = {
}
}
-exportT{'simd_constrep', each{rep_const, tup{i8, i16, i32, u64}}}
+}
-}
\ No newline at end of file
+exportT{'si_constrep', each{rep_const, dat_types}}
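[Note] ind_by_scan_i32 is the sparse path for monadic / (Where): when the result is much longer than the argument, it marks each run start with its index and lets a scan fill the gaps instead of looping over every count. A scalar sketch of the max-scan variant (the pre-SSE4.1 fallback uses a plus-scan of ones instead):

    #include <stdint.h>
    #include <stddef.h>
    // x holds n counts whose sum is s; r receives index j repeated x[j]
    // times. Writing j at each run start and max-scanning works because
    // the produced indices are nondecreasing.
    static void indices_by_scan_i32(const int32_t* x, int32_t* r,
                                    size_t n, size_t s) {
      for (size_t i = 0; i < s; i++) r[i] = 0;
      size_t ij = (size_t)x[0];
      for (size_t j = 1; j < n; j++) {
        if (ij < s) r[ij] = (int32_t)j;   // a later j overwrites zero-count runs
        ij += (size_t)x[j];
      }
      int32_t m = 0;                      // max-scan carries the last mark forward
      for (size_t i = 0; i < s; i++) { if (r[i] > m) m = r[i]; r[i] = m; }
    }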
diff --git a/src/singeli/src/slash.singeli b/src/singeli/src/slash.singeli
index e1560dd4..e66b97fa 100644
--- a/src/singeli/src/slash.singeli
+++ b/src/singeli/src/slash.singeli
@@ -339,8 +339,8 @@ def pext_popc{x:T, m:T} = {
tup{pe, scal{w} - z}
}
-def pext_width {..._ & hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2
-def thresh_bool{..._ & hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32
+def pext_width {& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 2
+def thresh_bool{& hasarch{'PCLMUL'} > hasarch{'AVX2'}} = 32
def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = {
def clmul{a, b} = zipLo{...@collect (j to 2) clmul{a,b,j}}
m := m0
@@ -359,9 +359,9 @@ def pext_popc{x0:V, m0:V & hasarch{'PCLMUL'} & V==[2]u64} = {
tup{x, @collect (j to 2) popc{extract{m0,j}}}
}
-def pext_width {..._ & hasarch{'BMI2'}} = 1
-def thresh_bool{..._ & hasarch{'BMI2'}} = 512
-def pext_popc{x:T, m:T & hasarch{'BMI2'} & T==u64} = tup{pext{x, m}, popc{m}}
+def pext_width {& fast_BMI2{}} = 1
+def thresh_bool{& fast_BMI2{}} = 512
+def pext_popc{x:T, m:T & fast_BMI2{} & T==u64} = tup{pext{x, m}, popc{m}}
fn compress_bool(w:*u64, x:*u64, r:*u64, n:u64) : void = {
cw:u64 = 0; # current word
diff --git a/src/singeli/src/transpose.singeli b/src/singeli/src/transpose.singeli
index 08dbc977..4fe11838 100644
--- a/src/singeli/src/transpose.singeli
+++ b/src/singeli/src/transpose.singeli
@@ -150,6 +150,14 @@ def transpose_with_kernel{T, k, kh, call_base, rp:*T, xp:*T, w, h, ws, hs} = {
}
}
+# Interleave n values of type T from x0 and x1 into r
+fn interleave{T}(r0:*void, x0:*void, x1:*void, n:u64) : void = {
+ rp := *T~~r0
+ @for (x0 in *T~~x0, x1 in *T~~x1 over i to n) {
+ store{rp, i*2, x0}; store{rp, i*2+1, x1}
+ }
+}
+
fn transpose{T, k, kh}(r0:*void, x0:*void, w:u64, h:u64, ws:u64, hs:u64) : void = {
# Scalar transpose defined in C
def ts = if (T==i8) 'i8' else if (T==i16) 'i16' else if (T==i32) 'i32' else 'i64'
@@ -160,7 +168,7 @@ fn transpose{T, k, kh}(r0:*void, x0:*void, w:u64, h:u64, ws:u64, hs:u64) : void
if (hasarch{'AVX2'} and w>=k and h>=k) {
transpose_with_kernel{T, k, kh, call_base, rp, xp, w, h, ws, hs}
} else {
- if (h==2 and h==hs) @for (x0 in xp, x1 in xp+ws over i to w) { store{rp, i*2, x0}; store{rp, i*2+1, x1} }
+ if (h==2 and h==hs) interleave{T}(r0, x0, *void~~(xp+ws), w)
else if (w==2 and w==ws) @for (r0 in rp, r1 in rp+hs over i to h) { r0 = load{xp, i*2}; r1 = load{xp, i*2+1} }
else call_base{rp, xp, w, h}
}
@@ -174,3 +182,5 @@ exportT{'simd_transpose', tup{
transpose{i32, 8},
transpose{i64, 4}
}}
+
+exportT{'si_interleave', each{interleave, tup{i8, i16, i32, i64}}}
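[Note] The exported si_interleave table is the element-level interleave that both the transpose h==2 path and try_interleave_cells call into; it is indexed by log2 of the element byte width, which is why a power-of-two cell can be interleaved as a single machine word of that width. Its scalar shape, sketched in C rather than the Singeli it is generated from:

    #include <stdint.h>
    #include <stddef.h>
    // One kernel per element width; the i32 instance shown. The Singeli
    // @for loop compiles to the same pattern, possibly vectorized.
    static void interleave_i32(int32_t* r, const int32_t* x0,
                               const int32_t* x1, size_t n) {
      for (size_t i = 0; i < n; i++) { r[2*i] = x0[i]; r[2*i + 1] = x1[i]; }
    }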