author    dzaima <dzaimagit@gmail.com>  2024-02-13 22:33:11 +0200
committer dzaima <dzaimagit@gmail.com>  2024-02-13 22:34:16 +0200
commit    1318026c4eb054c146ad64e84af01f2ccd9ef719 (patch)
tree      5b309d154c2ea2208068e691a7c4816f38a80a83
parent    3fc33382560905ec0fd7cd77701ba25c5e6cf6ef (diff)
a bunch of vfor
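
The change defines a vfor macro in src/h.h and switches a batch of simple element-wise loops over to it: under clang it expands to _Pragma("clang loop vectorize(assume_safety)") for, under GCC to _Pragma("GCC ivdep") for, and to a plain for elsewhere, telling the compiler it may assume the iterations are independent and vectorize them freely. build.bqn also gains -Wno-pass-failed, presumably to keep clang quiet when a vfor loop ends up not being vectorized. Below is a minimal standalone sketch of the macro and a typical use site; negate64 and main are illustrative stand-ins, not CBQN code (compare bit_negatePtr in arithm.c):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    // vectorization-safety hint: clang gets loop vectorize(assume_safety),
    // GCC gets ivdep; any other compiler just sees a plain for loop.
    #if __clang__
      #define vfor _Pragma("clang loop vectorize(assume_safety)") for
    #elif __GNUC__
      #define vfor _Pragma("GCC ivdep") for
    #else
      #define vfor for
    #endif

    // hypothetical example with the same shape as bit_negatePtr:
    // each iteration is independent, so the hint is safe here
    static void negate64(uint64_t* rp, const uint64_t* xp, size_t count) {
      vfor (size_t i = 0; i < count; i++) rp[i] = ~xp[i];
    }

    int main(void) {
      uint64_t x[4] = {0, 1, 2, 3}, r[4];
      negate64(r, x, 4);
      for (size_t i = 0; i < 4; i++) printf("%llx\n", (unsigned long long)r[i]);
      return 0;
    }
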
-rwxr-xr-x  build/src/build.bqn     1
-rw-r--r--  src/builtins/arithd.c  30
-rw-r--r--  src/builtins/arithm.c  22
-rw-r--r--  src/builtins/cmp.c      4
-rw-r--r--  src/builtins/grade.h    4
-rw-r--r--  src/builtins/search.c   2
-rw-r--r--  src/builtins/sfns.c    12
-rw-r--r--  src/builtins/slash.c    8
-rw-r--r--  src/builtins/sort.c     2
-rw-r--r--  src/builtins/sysfn.c   10
-rw-r--r--  src/core/harr.c         2
-rw-r--r--  src/ffi.c              14
-rw-r--r--  src/h.h                 3
-rw-r--r--  src/utils/mut.c        54
14 files changed, 86 insertions, 82 deletions
diff --git a/build/src/build.bqn b/build/src/build.bqn
index 7f3d4c3c..cd2471c4 100755
--- a/build/src/build.bqn
+++ b/build/src/build.bqn
@@ -311,6 +311,7 @@ po ← { # parsed options
⟨"-Wno-parentheses"⟩ # gcc
⟨ # clang
"-Wno-microsoft-anon-tag", "-Wno-bitwise-instead-of-logical", "-Wno-unknown-warning-option"
+ "-Wno-pass-failed" # vfor
⟩ ∾ (¬wasm)/⟨"-mllvm", "--x86-cmov-converter=0"⟩ # thing that converts `cmov`s to branching sometimes (?), but we don't want that
args∾↩ 0⊑GetLibs@
diff --git a/src/builtins/arithd.c b/src/builtins/arithd.c
index d2a756ae..83a11a19 100644
--- a/src/builtins/arithd.c
+++ b/src/builtins/arithd.c
@@ -29,7 +29,7 @@ B leading_axis_arith(FC2 fc2, B w, B x, usz* wsh, usz* xsh, ur mr);
#else
static void base_andBytes(u8* r, u8* x, u64 repeatedMask, u64 numBytes) {
u64* x64 = (u64*)x; usz i;
- for (i = 0; i < numBytes/8; i++) ((u64*)r)[i] = x64[i] & repeatedMask;
+ vfor (i = 0; i < numBytes/8; i++) ((u64*)r)[i] = x64[i] & repeatedMask;
if (i*8 != numBytes) {
u64 v = x64[i]&repeatedMask;
for (usz j = 0; j < (numBytes&7); j++) r[i*8 + j] = v>>(j*8);
@@ -47,7 +47,7 @@ B shape_c2(B t, B w, B x);
// floordiv will return float result only on ¯2147483648÷¯1 or n÷0, but may not otherwise squeeze integer types; integer argument requirement may be relaxed in the future
// divint will return float result if there's a fractional result, or in overflow cases same as floordiv
// TODO overflow-checked Singeli code for exact integer divint, and maybe floordiv_AA
-#define DIVLOOP(RE, WE, EXPR) RE* rp; B r=m_##RE##arrc(&rp, w); usz ia=IA(w); WE* wp=WE##any_ptr(w); for(ux i=0; i<ia; i++) rp[i] = (EXPR);
+#define DIVLOOP(RE, WE, EXPR) RE* rp; B r=m_##RE##arrc(&rp, w); usz ia=IA(w); WE* wp=WE##any_ptr(w); vfor(ux i=0; i<ia; i++) rp[i] = (EXPR);
static B divint_AA(B w, B x) { // consumes both
w = toI32Any(w);
x = toI32Any(x); i32* xp = tyany_ptr(x);
@@ -133,7 +133,7 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w))
#define Ri16(A) i16* rp; r=m_i16arrc(&rp, A);
#define Ri32(A) i32* rp; r=m_i32arrc(&rp, A);
#define Rf64(A) f64* rp; r=m_f64arrc(&rp, A);
- #define DOF(EXPR,A,W,X) { for (usz i = 0; i < ia; i++) { f64 wv=W; f64 xv=X; rp[i]=EXPR; } }
+ #define DOF(EXPR,A,W,X) { vfor (usz i = 0; i < ia; i++) { f64 wv=W; f64 xv=X; rp[i]=EXPR; } }
#define DOI8(EXPR,A,W,X,BASE) { Ri8(A) for (usz i=0; i<ia; i++) { i16 wv=W; i16 xv=X; i16 rv=EXPR; if (RARE(rv!=( i8)rv)) { decG(r); goto BASE; } rp[i]=rv; } goto dec_ret; }
#define DOI16(EXPR,A,W,X,BASE) { Ri16(A) for (usz i=0; i<ia; i++) { i32 wv=W; i32 xv=X; i32 rv=EXPR; if (RARE(rv!=(i16)rv)) { decG(r); goto BASE; } rp[i]=rv; } goto dec_ret; }
#define DOI32(EXPR,A,W,X,BASE) { Ri32(A) for (usz i=0; i<ia; i++) { i64 wv=W; i64 xv=X; i64 rv=EXPR; if (RARE(rv!=(i32)rv)) { decG(r); goto BASE; } rp[i]=rv; } goto dec_ret; }
@@ -151,20 +151,20 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w))
if (xe<el_i32) { x=taga(cpyI32Arr(x)); xe=el_i32; } void* xp=tyany_ptr(x); \
Rf64(x); \
if (we==el_i32) { B w,x /*shadow*/; \
- if (xe==el_i32) { DECOR for (usz i = 0; i < ia; i++) { w.f=((i32*)wp)[i]; x.f=((i32*)xp)[i]; rp[i]=EXPR; } } \
- else { DECOR for (usz i = 0; i < ia; i++) { w.f=((i32*)wp)[i]; x.f=((f64*)xp)[i]; rp[i]=EXPR; } } \
+ if (xe==el_i32) { DECOR vfor (usz i = 0; i < ia; i++) { w.f=((i32*)wp)[i]; x.f=((i32*)xp)[i]; rp[i]=EXPR; } } \
+ else { DECOR vfor (usz i = 0; i < ia; i++) { w.f=((i32*)wp)[i]; x.f=((f64*)xp)[i]; rp[i]=EXPR; } } \
} else { B w,x /*shadow*/; \
- if (xe==el_i32) { DECOR for (usz i = 0; i < ia; i++) { w.f=((f64*)wp)[i]; x.f=((i32*)xp)[i]; rp[i]=EXPR; } } \
- else { DECOR for (usz i = 0; i < ia; i++) { w.f=((f64*)wp)[i]; x.f=((f64*)xp)[i]; rp[i]=EXPR; } } \
+ if (xe==el_i32) { DECOR vfor (usz i = 0; i < ia; i++) { w.f=((f64*)wp)[i]; x.f=((i32*)xp)[i]; rp[i]=EXPR; } } \
+ else { DECOR vfor (usz i = 0; i < ia; i++) { w.f=((f64*)wp)[i]; x.f=((f64*)xp)[i]; rp[i]=EXPR; } } \
} \
decG(w); decG(x); return num_squeeze(r); \
} \
} else if (isF64(w)&isArr(x)) { usz ia=IA(x); u8 xe=TI(x,elType); \
- if (elInt(xe)) {INT_SA Rf64(x); x=toI32Any(x); PI32(x) DECOR for (usz i=0; i<ia; i++) {B x/*shadow*/;x.f=xp[i];rp[i]=EXPR;} decG(x); return num_squeeze(r); } \
- if (xe==el_f64){ Rf64(x); PF(x) FLT_SAI DECOR for (usz i=0; i<ia; i++) {B x/*shadow*/;x.f=xp[i];rp[i]=EXPR;} decG(x); return num_squeeze(r); } \
+ if (elInt(xe)) {INT_SA Rf64(x); x=toI32Any(x); PI32(x) DECOR vfor (usz i=0; i<ia; i++) {B x/*shadow*/;x.f=xp[i];rp[i]=EXPR;} decG(x); return num_squeeze(r); } \
+ if (xe==el_f64){ Rf64(x); PF(x) FLT_SAI DECOR vfor (usz i=0; i<ia; i++) {B x/*shadow*/;x.f=xp[i];rp[i]=EXPR;} decG(x); return num_squeeze(r); } \
} else if (isF64(x)&isArr(w)) { usz ia=IA(w); u8 we=TI(w,elType); ANY_AS \
- if (elInt(we)) {INT_AS Rf64(w); w=toI32Any(w); PI32(w) DECOR for (usz i=0; i<ia; i++) {B w/*shadow*/;w.f=wp[i];rp[i]=EXPR;} decG(w); return num_squeeze(r); } \
- if (we==el_f64){ Rf64(w); PF(w) DECOR for (usz i=0; i<ia; i++) {B w/*shadow*/;w.f=wp[i];rp[i]=EXPR;} decG(w); return num_squeeze(r); } \
+ if (elInt(we)) {INT_AS Rf64(w); w=toI32Any(w); PI32(w) DECOR vfor (usz i=0; i<ia; i++) {B w/*shadow*/;w.f=wp[i];rp[i]=EXPR;} decG(w); return num_squeeze(r); } \
+ if (we==el_f64){ Rf64(w); PF(w) DECOR vfor (usz i=0; i<ia; i++) {B w/*shadow*/;w.f=wp[i];rp[i]=EXPR;} decG(w); return num_squeeze(r); } \
} \
P2(NAME) \
} \
@@ -214,7 +214,7 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w))
}
, /*INT_AS*/ if (q_i32(x)) return modint_AS(w, x);
, /*INT_AA*/ return modint_AA(w, x);
- , /*FLT_SAI*/ if (o2fG(w)==1) { for (usz i=0; i<ia; i++) rp[i] = xp[i]-floor(xp[i]); } else
+ , /*FLT_SAI*/ if (o2fG(w)==1) { vfor (usz i=0; i<ia; i++) rp[i] = xp[i]-floor(xp[i]); } else
, /*ANY_AS*/
)
#undef GC2f
@@ -271,13 +271,13 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w))
static NOINLINE B bitAA1(B w, B x, usz ia) {
u64* rp; B r = m_bitarrc(&rp, x);
u64* wp=bitarr_ptr(w); u64* xp=bitarr_ptr(x);
- for (usz i=0; i<BIT_N(ia); i++) rp[i] = wp[i]|xp[i];
+ vfor (usz i=0; i<BIT_N(ia); i++) rp[i] = wp[i]|xp[i];
decG(w); decG(x); return r;
}
static NOINLINE B bitAA2(B w, B x, usz ia) {
u64* rp; B r = m_bitarrc(&rp, x);
u64* wp=bitarr_ptr(w); u64* xp=bitarr_ptr(x);
- for (usz i=0; i<BIT_N(ia); i++) rp[i] = wp[i]&xp[i];
+ vfor (usz i=0; i<BIT_N(ia); i++) rp[i] = wp[i]&xp[i];
decG(w); decG(x); return r;
}
@@ -383,7 +383,7 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w))
if (we!=el_c32) w = taga(cpyC32Arr(w));
u32* wp = c32any_ptr(w); usz wia = IA(w);
i32* rp; r = m_i32arrc(&rp, w);
- for (usz i = 0; i < wia; i++) rp[i] = (i32)wp[i] - xv;
+ vfor (usz i = 0; i < wia; i++) rp[i] = (i32)wp[i] - xv;
goto dec_ret;
}
})
diff --git a/src/builtins/arithm.c b/src/builtins/arithm.c
index fdce468e..d2e9e162 100644
--- a/src/builtins/arithm.c
+++ b/src/builtins/arithm.c
@@ -11,7 +11,7 @@ static inline B arith_recm(FC1 f, B x) {
}
void bit_negatePtr(u64* rp, u64* xp, usz count) {
- for (usz i = 0; i < count; i++) rp[i] = ~xp[i];
+ vfor (usz i = 0; i < count; i++) rp[i] = ~xp[i];
}
B bit_negate(B x) { // consumes
u64* xp = bitarr_ptr(x);
@@ -48,10 +48,10 @@ B add_c1(B t, B x) {
i64 ia = IA(x); INIT; \
void* xp = tyany_ptr(x); \
switch(xe) { default: UD; \
- case el_i8: for(usz i=0; i<ia; i++) { i8 c = ((i8* )xp)[i]; EXPR(i8, c==I8_MIN) } break; \
- case el_i16: for(usz i=0; i<ia; i++) { i16 c = ((i16*)xp)[i]; EXPR(i16, c==I16_MIN) } break; \
- case el_i32: for(usz i=0; i<ia; i++) { i32 c = ((i32*)xp)[i]; EXPR(i32, c==I32_MIN) } break; \
- case el_f64: for(usz i=0; i<ia; i++) { f64 c = ((f64*)xp)[i]; EXPR(f64, 0) } break; \
+ case el_i8: for(usz i=0; i<ia; i++) { i8 c = ((i8* )xp)[i]; EXPR(i8, c==I8_MIN) } break; \
+ case el_i16: for(usz i=0; i<ia; i++) { i16 c = ((i16*)xp)[i]; EXPR(i16, c==I16_MIN) } break; \
+ case el_i32: for(usz i=0; i<ia; i++) { i32 c = ((i32*)xp)[i]; EXPR(i32, c==I32_MIN) } break; \
+ case el_f64: vfor(usz i=0; i<ia; i++) { f64 c = ((f64*)xp)[i]; EXPR(f64, 0) } break; \
} \
decG(x); return r; POST \
}
@@ -75,11 +75,11 @@ B add_c1(B t, B x) {
#define STILE_BODY(FEXPR) LOOP_BODY(B r; void* rp = m_tyarrlc(&r, elWidth(xe), x, el2t(xe));, STILE_EXPR, bad: tyarr_freeF(v(r));)
#endif
-#define FLOAT_BODY(FEXPR) { i64 ia = IA(x); \
- assert(xe==el_f64); f64* xp = f64any_ptr(x); \
- f64* rp; B r = m_f64arrc(&rp, x); \
- for (usz i = 0; i < ia; i++) { f64 v=xp[i]; rp[i]=FEXPR; } \
- decG(x); return num_squeeze(r); \
+#define FLOAT_BODY(FEXPR) { i64 ia = IA(x); \
+ assert(xe==el_f64); f64* xp = f64any_ptr(x); \
+ f64* rp; B r = m_f64arrc(&rp, x); \
+ vfor (usz i = 0; i < ia; i++) { f64 v=xp[i]; rp[i]=FEXPR; } \
+ decG(x); return num_squeeze(r); \
}
B sub_c2(B,B,B);
#define SUB_BODY(FEXPR) return sub_c2(t, m_f64(0), x);
@@ -101,7 +101,7 @@ GC1i("¬", not, 1-v, el_bit, bit_negate(x), NOT_BODY)
u64 ia = IA(x); \
f64* xp = f64any_ptr(x); \
f64* rp; B r = m_f64arrc(&rp, x); \
- for (i64 i = 0; i < ia; i++) { \
+ vfor (i64 i = 0; i < ia; i++) { \
f64 xv=xp[i]; rp[i] = (F); \
} \
decG(x); return r; \
diff --git a/src/builtins/cmp.c b/src/builtins/cmp.c
index 3fa9c370..bd8aa0dd 100644
--- a/src/builtins/cmp.c
+++ b/src/builtins/cmp.c
@@ -69,7 +69,7 @@ CMP_REC(ne, ne, swapped=0;)
#define CMP_AA0(N, T, BODY) void base_##N##AA##_##T(u64* r, void* w, void* x, u64 l) { BODY }
#define CMP_AA1(N, T, OP) CMP_AA0(N, T, BASE_CMP_LOOP(OP, ((T*)w)[i], ((T*)x)[i]))
#define CMP_AA_F(N, OP, BX) \
- CMP_AA0(N, u1, ({usz bia = BIT_N(l); for (usz i=0; i<bia; i++) { u64 wv=((u64*)w)[i], xv=((u64*)x)[i]; ((u64*)r)[i] = BX; }});) \
+ CMP_AA0(N, u1, ({usz bia = BIT_N(l); vfor (usz i=0; i<bia; i++) { u64 wv=((u64*)w)[i], xv=((u64*)x)[i]; ((u64*)r)[i] = BX; }});) \
CMP_AA1(N, i8, OP) CMP_AA1(N, i16, OP) CMP_AA1(N, i32, OP) CMP_AA1(N, f64, OP) \
const CmpAAFn base_##N##AA##_u32 = base_##N##AA##_i32;
@@ -104,7 +104,7 @@ CMP_REC(ne, ne, swapped=0;)
#define CMP_SA1(N, T, Q, C, SLOW, OP) CMP_SA0(N, T, Q, SLOW, ({ T xv = C(x); BASE_CMP_LOOP(OP, ((T*)w)[i], xv) }))
#define CMP_SA_F(N, OP, SLOW, BX) \
- CMP_SA0(N, u1, bit, SLOW, ({usz bia = BIT_N(l); u64 xv=bitx(x); for (usz i=0; i<bia; i++) { u64 wv=((u64*)w)[i]; ((u64*)r)[i] = BX; }})) \
+ CMP_SA0(N, u1, bit, SLOW, ({usz bia = BIT_N(l); u64 xv=bitx(x); vfor (usz i=0; i<bia; i++) { u64 wv=((u64*)w)[i]; ((u64*)r)[i] = BX; }})) \
CMP_SA1(N,i8,i8,o2iG,SLOW,OP) CMP_SA1(N,i16,i16,o2iG,SLOW,OP) CMP_SA1(N,i32,i32,o2iG,SLOW,OP) CMP_SA1(N,f64,f64,o2fG,SLOW,OP) \
CMP_SA1(N,u8,c8,o2cG,SLOW,OP) CMP_SA1(N,u16,c16,o2cG,SLOW,OP) CMP_SA1(N,u32,c32,o2cG,SLOW,OP)
diff --git a/src/builtins/grade.h b/src/builtins/grade.h
index 80d3555d..7890d80d 100644
--- a/src/builtins/grade.h
+++ b/src/builtins/grade.h
@@ -319,12 +319,12 @@ B GRADE_CAT(c1)(B t, B x) {
}
TALLOC(I32I32p, tmp, ia);
- for (usz i = 0; i < ia; i++) {
+ vfor (usz i = 0; i < ia; i++) {
tmp[i].v = i;
tmp[i].k = xp[i];
}
CAT(GRADE_CAT(IP),tim_sort)(tmp, ia);
- for (usz i = 0; i < ia; i++) rp[i] = tmp[i].v;
+ vfor (usz i = 0; i < ia; i++) rp[i] = tmp[i].v;
TFREE(tmp);
goto decG_sq;
}
diff --git a/src/builtins/search.c b/src/builtins/search.c
index 8ce7d8d1..49221225 100644
--- a/src/builtins/search.c
+++ b/src/builtins/search.c
@@ -574,7 +574,7 @@ B asNormalized(B x, usz n, bool nanBad) {
}
#endif
} else {
- for (; i < n; i++) rp[i] = normalizeFloat(fp[i]);
+ vfor (; i < n; i++) rp[i] = normalizeFloat(fp[i]);
}
if (r.u!=x.u) decG(x);
diff --git a/src/builtins/sfns.c b/src/builtins/sfns.c
index 1fa22c73..2d6e7e07 100644
--- a/src/builtins/sfns.c
+++ b/src/builtins/sfns.c
@@ -1066,18 +1066,18 @@ B reverse_c1(B t, B x) {
case 0: {
u64* rp; r = m_bitarrc(&rp, x);
u64* xp=xv; usz g = BIT_N(n); usz e = g-1;
- for (usz i = 0; i < g; i++) rp[i] = bit_reverse(xp[e-i]);
+ vfor (usz i = 0; i < g; i++) rp[i] = bit_reverse(xp[e-i]);
if (n&63) {
u64 sh=(-n)&63;
- for (usz i=0; i<e; i++) rp[i]=rp[i]>>sh|rp[i+1]<<(64-sh);
+ vfor (usz i=0; i<e; i++) rp[i]=rp[i]>>sh|rp[i+1]<<(64-sh);
rp[e]>>=sh;
}
break;
}
- case 3: { u8* xp=xv; u8* rp = m_tyarrc(&r, 1, x, xt); for (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; }
- case 4: { u16* xp=xv; u16* rp = m_tyarrc(&r, 2, x, xt); for (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; }
- case 5: { u32* xp=xv; u32* rp = m_tyarrc(&r, 4, x, xt); for (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; }
- case 6: if (TI(x,elType)!=el_B) { u64* xp=xv; u64* rp = m_tyarrc(&r, 8, x, xt); for (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; }
+ case 3: { u8* xp=xv; u8* rp = m_tyarrc(&r, 1, x, xt); vfor (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; }
+ case 4: { u16* xp=xv; u16* rp = m_tyarrc(&r, 2, x, xt); vfor (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; }
+ case 5: { u32* xp=xv; u32* rp = m_tyarrc(&r, 4, x, xt); vfor (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; }
+ case 6: if (TI(x,elType)!=el_B) { u64* xp=xv; u64* rp = m_tyarrc(&r, 8, x, xt); vfor (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; }
else {
HArr_p rp = m_harrUc(x);
B* xp = arr_bptr(x);
diff --git a/src/builtins/slash.c b/src/builtins/slash.c
index 310b6f51..b2555a37 100644
--- a/src/builtins/slash.c
+++ b/src/builtins/slash.c
@@ -299,7 +299,7 @@ static B where(B x, usz xia, u64 s) {
usz bs;
if (b>xia-i) { b=xia-i; bs=s-(rp-rp0); } else { bs=bit_sum(xp,b); }
where_block_u16(xp, buf, b, bs);
- for (usz j=0; j<bs; j++) rp[j] = i+buf[j];
+ vfor (usz j=0; j<bs; j++) rp[j] = i+buf[j];
rp+= bs;
xp+= b/64;
}
@@ -380,7 +380,7 @@ B grade_bool(B x, usz xia, bool up) {
u64 xp0[4]; // 4 ≡ b/64
u64* xp1 = xp;
for (usz i=0; i<xia; i+=b) {
- for (usz j=0; j<BIT_N(b); j++) xp0[j] = ~xp1[j];
+ vfor (usz j=0; j<BIT_N(b); j++) xp0[j] = ~xp1[j];
usz b2 = b>xia-i? xia-i : b;
if (b2<b) { u64 q=b2%64; usz e=b2/64; u64 m=((u64)1<<q)-1; xp0[e]&=m; xp1[e]&=m; }
usz s0=bit_sum(xp0,b2); si_1slash32(xp0, i, rp0, b2, s0); rp0+=s0;
@@ -888,7 +888,7 @@ B slash_im(B t, B x) {
for (usz i=0; i<xia; i++) t[(u##N)xp[i]]++; \
t[m/2]=xia; usz ria=0; for (u64 s=0; s<xia; ria++) s+=t[ria]; \
if (ria>m/2) thrM("/⁼: Argument cannot contain negative numbers"); \
- i32* rp; r = m_i32arrv(&rp, ria); for (usz i=0; i<ria; i++) rp[i]=t[i]; \
+ i32* rp; r = m_i32arrv(&rp, ria); vfor (usz i=0; i<ria; i++) rp[i]=t[i]; \
TFREE(t); \
r = num_squeeze(r); \
} \
@@ -901,7 +901,7 @@ B slash_im(B t, B x) {
i8 max = avx2_count_i8(t, (i8*)xp, xia, 0); \
if (max < 0) thrM("/⁼: Argument cannot contain negative numbers"); \
usz ria=max+1; \
- i32* rp; r = m_i32arrv(&rp, ria); for (usz i=0; i<ria; i++) rp[i]=t[i]; \
+ i32* rp; r = m_i32arrv(&rp, ria); vfor (usz i=0; i<ria; i++) rp[i]=t[i]; \
TFREE(t); \
r = num_squeeze(r); \
} else
diff --git a/src/builtins/sort.c b/src/builtins/sort.c
index e803ba21..8492aac7 100644
--- a/src/builtins/sort.c
+++ b/src/builtins/sort.c
@@ -21,7 +21,7 @@ static NOINLINE void generic_grade(B x, usz ia, B r, i32* rp, void (*fn)(BI32p*,
tmp[i].k = GetU(x,i);
}
fn(tmp, ia);
- for (usz i = 0; i < ia; i++) rp[i] = tmp[i].v;
+ vfor (usz i = 0; i < ia; i++) rp[i] = tmp[i].v;
TFREE(tmp);
}
diff --git a/src/builtins/sysfn.c b/src/builtins/sysfn.c
index 81e9675e..d585773a 100644
--- a/src/builtins/sysfn.c
+++ b/src/builtins/sysfn.c
@@ -1566,7 +1566,7 @@ B bitop1(B f, B x, enum BitOp1 op, char* name) {
case op_neg: switch (ow) {
default: thrF("•bit._%U: unhandled width %s", name, ow);
#define CASE(W) case W: \
- NOUNROLL for (usz i=0; i<n/W; i++) ((u##W*)rp)[i] = -((u##W*)xp)[i]; \
+ NOUNROLL vfor (usz i=0; i<n/W; i++) ((u##W*)rp)[i] = -((u##W*)xp)[i]; \
break;
CASE(8) CASE(16) CASE(32) CASE(64)
#undef CASE
@@ -1643,11 +1643,11 @@ B bitop2(B f, B w, B x, enum BitOp2 op, char* name) {
}
if (noextend) {
#define BINOP(O,P) case op_##O: { \
- usz l = n/64; NOUNROLL for (usz i=0; i<l; i++) rp[i] = wp[i] P xp[i]; \
+ usz l = n/64; NOUNROLL vfor (usz i=0; i<l; i++) rp[i] = wp[i] P xp[i]; \
usz q = (-n)%64; if (q) rp[l] ^= (~(u64)0 >> q) & (rp[l]^(wp[l] P xp[l])); \
} break;
#define CASE(W, Q, P) case W: \
- NOUNROLL for (usz i=0; i<n/W; i++) \
+ NOUNROLL vfor (usz i=0; i<n/W; i++) \
((Q##W*)rp)[i] = ((Q##W*)wp)[i] P ((Q##W*)xp)[i]; \
break;
SWITCH
@@ -1659,12 +1659,12 @@ B bitop2(B f, B w, B x, enum BitOp2 op, char* name) {
if (ow>64) thrF("•bit._%U: scalar extension with width over 64 unhandled", name); \
u64 wv = *wp & (~(u64)0>>(64-ow)); \
for (usz tw=ow; tw<64; tw*=2) wv|=wv<<tw; \
- usz l = n/64; NOUNROLL for (usz i=0; i<l; i++) rp[i] = wv P xp[i]; \
+ usz l = n/64; NOUNROLL vfor (usz i=0; i<l; i++) rp[i] = wv P xp[i]; \
usz q = (-n)%64; if (q) rp[l] ^= (~(u64)0 >> q) & (rp[l]^(wv P xp[l])); \
} break;
#define CASE(W, Q, P) case W: { \
Q##W wv = *(Q##W*)wp; \
- NOUNROLL for (usz i=0; i<n/W; i++) \
+ NOUNROLL vfor (usz i=0; i<n/W; i++) \
((Q##W*)rp)[i] = wv P ((Q##W*)xp)[i]; \
} break;
SWITCH
diff --git a/src/core/harr.c b/src/core/harr.c
index 9db881da..1b00a10a 100644
--- a/src/core/harr.c
+++ b/src/core/harr.c
@@ -4,7 +4,7 @@
NOINLINE B m_caB(usz ia, B* a) {
HArr_p r = m_harrUv(ia);
- for (usz i = 0; i < ia; i++) r.a[i] = a[i];
+ vfor (usz i = 0; i < ia; i++) r.a[i] = a[i];
NOGC_E;
return r.b;
}
diff --git a/src/ffi.c b/src/ffi.c
index 041d499b..7e6fcf2f 100644
--- a/src/ffi.c
+++ b/src/ffi.c
@@ -78,7 +78,7 @@ BQN_EXP size_t bqn_rank(BQNV a) { return RNK(getB(a)); }
BQN_EXP void bqn_shape(BQNV a, size_t* buf) { B b = getB(a);
ur r = RNK(b);
usz* sh = SH(b);
- for (usz i = 0; i < r; i++) buf[i] = sh[i];
+ vfor (usz i = 0; i < r; i++) buf[i] = sh[i];
}
BQN_EXP BQNV bqn_pick(BQNV a, size_t pos) {
return makeX(IGet(getB(a),pos));
@@ -590,7 +590,7 @@ FORCE_INLINE u64 i64abs(i64 x) { return x<0?-x:x; }
usz ia = IA(x); \
B t = WIDEN(x); WEL* tp = WEL##any_ptr(t); \
REL* rp; B r = m_##REL##arrv(&rp, ia); \
- for (usz i=0; i<ia; i++) ((UEL*)rp)[i] = tp[i]; \
+ vfor (usz i=0; i<ia; i++) ((UEL*)rp)[i] = tp[i];\
decG(t); return r;
// copy elements of x to array of unsigned integers (using a signed integer array type as a "container"); consumes argument
@@ -603,7 +603,7 @@ NOINLINE B cpyF32Bits(B x) { // copy x to a 32-bit float array (using an i32arr
usz ia = IA(x);
B t = toF64Any(x); f64* tp = f64any_ptr(t);
i32* rp; B r = m_i32arrv(&rp, ia);
- for (usz i=0; i<ia; i++) ((f32*)rp)[i]=tp[i];
+ vfor (usz i=0; i<ia; i++) ((f32*)rp)[i]=tp[i];
dec(t); return r;
}
@@ -613,10 +613,10 @@ static B toU16Bits(B x) { return TI(x,elType)==el_i16? x : cpyU16Bits(x); }
static B toU8Bits(B x) { return TI(x,elType)==el_i8? x : cpyU8Bits(x); }
// read x as the specified type (assuming a container of the respective width signed integer array); consumes x
-NOINLINE B readU8Bits(B x) { usz ia=IA(x); u8* xp=tyarr_ptr(x); i16* rp; B r=m_i16arrv(&rp, ia); for (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); }
-NOINLINE B readU16Bits(B x) { usz ia=IA(x); u16* xp=tyarr_ptr(x); i32* rp; B r=m_i32arrv(&rp, ia); for (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); }
-NOINLINE B readU32Bits(B x) { usz ia=IA(x); u32* xp=tyarr_ptr(x); f64* rp; B r=m_f64arrv(&rp, ia); for (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); }
-NOINLINE B readF32Bits(B x) { usz ia=IA(x); f32* xp=tyarr_ptr(x); f64* rp; B r=m_f64arrv(&rp, ia); for (usz i=0; i<ia; i++) rp[i]=xp[i]; return r; }
+NOINLINE B readU8Bits(B x) { usz ia=IA(x); u8* xp=tyarr_ptr(x); i16* rp; B r=m_i16arrv(&rp, ia); vfor (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); }
+NOINLINE B readU16Bits(B x) { usz ia=IA(x); u16* xp=tyarr_ptr(x); i32* rp; B r=m_i32arrv(&rp, ia); vfor (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); }
+NOINLINE B readU32Bits(B x) { usz ia=IA(x); u32* xp=tyarr_ptr(x); f64* rp; B r=m_f64arrv(&rp, ia); vfor (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); }
+NOINLINE B readF32Bits(B x) { usz ia=IA(x); f32* xp=tyarr_ptr(x); f64* rp; B r=m_f64arrv(&rp, ia); vfor (usz i=0; i<ia; i++) rp[i]=xp[i]; return r; }
B m_ptrobj_s(void* ptr, B o); // consumes o, sets stride to size of o
B m_ptrobj(void* ptr, B o, ux stride); // consumes o
static NOINLINE B ptrobj_checkget(B x); // doesn't consume
diff --git a/src/h.h b/src/h.h
index 2158d346..c975e660 100644
--- a/src/h.h
+++ b/src/h.h
@@ -123,13 +123,16 @@ typedef size_t ux;
#if __clang__
#define NOUNROLL _Pragma("clang loop unroll(disable)")
#define NOVECTORIZE _Pragma("clang loop vectorize(disable)")
+ #define vfor _Pragma("clang loop vectorize(assume_safety)") for
#elif __GNUC__
#define EXACTLY_GCC 1
#define NOUNROLL _Pragma("GCC unroll 1")
+ #define vfor _Pragma("GCC ivdep") for
#define NOVECTORIZE
#else
#define NOUNROLL
#define NOVECTORIZE
+ #define vfor for
#endif
#define PLAINLOOP NOUNROLL NOVECTORIZE
#if EXACTLY_GCC
diff --git a/src/utils/mut.c b/src/utils/mut.c
index 4a538a08..1deea741 100644
--- a/src/utils/mut.c
+++ b/src/utils/mut.c
@@ -289,12 +289,12 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x
B* mpo = ms+(B*)a;
switch(TY(x)) {
case t_bitarr: { u64* xp = bitarr_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_i32(bitp_get(xp, xs+i)); return; }
- case t_i8arr: case t_i8slice: { i8* xp = i8any_ptr (x); for (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; }
- case t_i16arr: case t_i16slice: { i16* xp = i16any_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; }
- case t_i32arr: case t_i32slice: { i32* xp = i32any_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; }
- case t_c8arr: case t_c8slice: { u8* xp = c8any_ptr (x); for (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; }
- case t_c16arr: case t_c16slice: { u16* xp = c16any_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; }
- case t_c32arr: case t_c32slice: { u32* xp = c32any_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; }
+ case t_i8arr: case t_i8slice: { i8* xp = i8any_ptr (x); vfor (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; }
+ case t_i16arr: case t_i16slice: { i16* xp = i16any_ptr(x); vfor (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; }
+ case t_i32arr: case t_i32slice: { i32* xp = i32any_ptr(x); vfor (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; }
+ case t_c8arr: case t_c8slice: { u8* xp = c8any_ptr (x); vfor (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; }
+ case t_c16arr: case t_c16slice: { u16* xp = c16any_ptr(x); vfor (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; }
+ case t_c32arr: case t_c32slice: { u32* xp = c32any_ptr(x); vfor (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; }
case t_harr: case t_hslice: case t_fillarr: case t_fillslice:;
B* xp = arr_bptr(x)+xs;
for (usz i = 0; i < l; i++) inc(xp[i]);
@@ -393,15 +393,15 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x
E* rp; Arr* r = m_##E##arrp(&rp, ia); \
arr_shCopy(r, x); \
u8 xe = TI(x,elType); \
- if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<ia; i++) rp[i]=bitp_get(xp,i); } \
- else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
- else if (xe==el_i16) { i16* xp = i16any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
- else if (xe==el_i32) { i32* xp = i32any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
- else if (xe==el_f64) { f64* xp = f64any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
+ if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<ia; i++) rp[i]=bitp_get(xp,i); } \
+ else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
+ else if (xe==el_i16) { i16* xp = i16any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
+ else if (xe==el_i32) { i32* xp = i32any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
+ else if (xe==el_f64) { f64* xp = f64any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
else { \
B* xp = arr_bptr(x); \
- if (xp!=NULL) { for (usz i=0; i<ia; i++) rp[i]=o2fG(xp[i] ); } \
- else { SGetU(x) for (usz i=0; i<ia; i++) rp[i]=o2fG(GetU(x,i)); } \
+ if (xp!=NULL) { vfor (usz i=0; i<ia; i++) rp[i]=o2fG(xp[i] ); } \
+ else { SGetU(x) for (usz i=0; i<ia; i++) rp[i]=o2fG(GetU(x,i)); } \
} \
ptr_decT(a(x)); \
return r; \
@@ -413,13 +413,13 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x
T##Atom* rp; Arr* r = m_##E##arrp(&rp, ia); \
arr_shCopy(r, x); \
u8 xe = TI(x,elType); \
- if (xe==el_c8 ) { u8* xp = c8any_ptr (x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
- else if (xe==el_c16) { u16* xp = c16any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
- else if (xe==el_c32) { u32* xp = c32any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
+ if (xe==el_c8 ) { u8* xp = c8any_ptr (x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
+ else if (xe==el_c16) { u16* xp = c16any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
+ else if (xe==el_c32) { u32* xp = c32any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \
else { \
B* xp = arr_bptr(x); \
- if (xp!=NULL) { for (usz i=0; i<ia; i++) rp[i]=o2cG(xp[i] ); } \
- else { SGetU(x) for (usz i=0; i<ia; i++) rp[i]=o2cG(GetU(x,i)); } \
+ if (xp!=NULL) { vfor (usz i=0; i<ia; i++) rp[i]=o2cG(xp[i] ); } \
+ else { SGetU(x) for (usz i=0; i<ia; i++) rp[i]=o2cG(GetU(x,i)); } \
} \
ptr_decT(a(x)); \
return r; \
@@ -429,14 +429,14 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x
usz ia = IA(x);
HArr_p r = m_harrUc(x);
u8 xe = TI(x,elType);
- if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(bitp_get(xp, i)); }
- else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); }
- else if (xe==el_i16) { i16* xp = i16any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); }
- else if (xe==el_i32) { i32* xp = i32any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); }
- else if (xe==el_f64) { f64* xp = f64any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); }
- else if (xe==el_c8 ) { u8* xp = c8any_ptr (x); for(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); }
- else if (xe==el_c16) { u16* xp = c16any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); }
- else if (xe==el_c32) { u32* xp = c32any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); }
+ if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(bitp_get(xp, i)); }
+ else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); vfor(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); }
+ else if (xe==el_i16) { i16* xp = i16any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); }
+ else if (xe==el_i32) { i32* xp = i32any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); }
+ else if (xe==el_f64) { f64* xp = f64any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); }
+ else if (xe==el_c8 ) { u8* xp = c8any_ptr (x); vfor(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); }
+ else if (xe==el_c16) { u16* xp = c16any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); }
+ else if (xe==el_c32) { u32* xp = c32any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); }
else {
B* xp = arr_bptr(x);
if (xp!=NULL) { for (usz i=0; i<ia; i++) r.a[i] = inc(xp[i]); }
@@ -451,7 +451,7 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x
u64* rp; Arr* r = m_bitarrp(&rp, ia);
arr_shCopy(r, x);
u8 xe = TI(x,elType);
- if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<BIT_N(ia); i++) rp[i] = xp[i]; }
+ if (xe==el_bit) { u64* xp = bitarr_ptr(x); vfor(usz i=0; i<BIT_N(ia); i++) rp[i] = xp[i]; }
else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); for(usz i=0; i<ia; i++) bitp_set(rp,i,xp[i]); }
else if (xe==el_i16) { i16* xp = i16any_ptr(x); for(usz i=0; i<ia; i++) bitp_set(rp,i,xp[i]); }
else if (xe==el_i32) { i32* xp = i32any_ptr(x); for(usz i=0; i<ia; i++) bitp_set(rp,i,xp[i]); }