diff options
author | dzaima <dzaimagit@gmail.com> | 2024-02-13 22:33:11 +0200 |
---|---|---|
committer | dzaima <dzaimagit@gmail.com> | 2024-02-13 22:34:16 +0200 |
commit | 1318026c4eb054c146ad64e84af01f2ccd9ef719 (patch) | |
tree | 5b309d154c2ea2208068e691a7c4816f38a80a83 | |
parent | 3fc33382560905ec0fd7cd77701ba25c5e6cf6ef (diff) |
a bunch of vfor
-rwxr-xr-x | build/src/build.bqn | 1 | ||||
-rw-r--r-- | src/builtins/arithd.c | 30 | ||||
-rw-r--r-- | src/builtins/arithm.c | 22 | ||||
-rw-r--r-- | src/builtins/cmp.c | 4 | ||||
-rw-r--r-- | src/builtins/grade.h | 4 | ||||
-rw-r--r-- | src/builtins/search.c | 2 | ||||
-rw-r--r-- | src/builtins/sfns.c | 12 | ||||
-rw-r--r-- | src/builtins/slash.c | 8 | ||||
-rw-r--r-- | src/builtins/sort.c | 2 | ||||
-rw-r--r-- | src/builtins/sysfn.c | 10 | ||||
-rw-r--r-- | src/core/harr.c | 2 | ||||
-rw-r--r-- | src/ffi.c | 14 | ||||
-rw-r--r-- | src/h.h | 3 | ||||
-rw-r--r-- | src/utils/mut.c | 54 |
14 files changed, 86 insertions, 82 deletions
diff --git a/build/src/build.bqn b/build/src/build.bqn index 7f3d4c3c..cd2471c4 100755 --- a/build/src/build.bqn +++ b/build/src/build.bqn @@ -311,6 +311,7 @@ po ← { # parsed options ⟨"-Wno-parentheses"⟩ # gcc ⟨ # clang "-Wno-microsoft-anon-tag", "-Wno-bitwise-instead-of-logical", "-Wno-unknown-warning-option" + "-Wno-pass-failed" # vfor ⟩ ∾ (¬wasm)/⟨"-mllvm", "--x86-cmov-converter=0"⟩ # thing that converts `cmov`s to branching sometimes (?), but we don't want that ⟩ args∾↩ 0⊑GetLibs@ diff --git a/src/builtins/arithd.c b/src/builtins/arithd.c index d2a756ae..83a11a19 100644 --- a/src/builtins/arithd.c +++ b/src/builtins/arithd.c @@ -29,7 +29,7 @@ B leading_axis_arith(FC2 fc2, B w, B x, usz* wsh, usz* xsh, ur mr); #else static void base_andBytes(u8* r, u8* x, u64 repeatedMask, u64 numBytes) { u64* x64 = (u64*)x; usz i; - for (i = 0; i < numBytes/8; i++) ((u64*)r)[i] = x64[i] & repeatedMask; + vfor (i = 0; i < numBytes/8; i++) ((u64*)r)[i] = x64[i] & repeatedMask; if (i*8 != numBytes) { u64 v = x64[i]&repeatedMask; for (usz j = 0; j < (numBytes&7); j++) r[i*8 + j] = v>>(j*8); @@ -47,7 +47,7 @@ B shape_c2(B t, B w, B x); // floordiv will return float result only on ¯2147483648÷¯1 or n÷0, but may not otherwise squeeze integer types; integer argument requirement may be relaxed in the future // divint will return float result if there's a fractional result, or in overflow cases same as floordiv // TODO overflow-checked Singeli code for exact integer divint, and maybe floordiv_AA -#define DIVLOOP(RE, WE, EXPR) RE* rp; B r=m_##RE##arrc(&rp, w); usz ia=IA(w); WE* wp=WE##any_ptr(w); for(ux i=0; i<ia; i++) rp[i] = (EXPR); +#define DIVLOOP(RE, WE, EXPR) RE* rp; B r=m_##RE##arrc(&rp, w); usz ia=IA(w); WE* wp=WE##any_ptr(w); vfor(ux i=0; i<ia; i++) rp[i] = (EXPR); static B divint_AA(B w, B x) { // consumes both w = toI32Any(w); x = toI32Any(x); i32* xp = tyany_ptr(x); @@ -133,7 +133,7 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w)) #define Ri16(A) i16* rp; r=m_i16arrc(&rp, A); #define Ri32(A) i32* rp; r=m_i32arrc(&rp, A); #define Rf64(A) f64* rp; r=m_f64arrc(&rp, A); - #define DOF(EXPR,A,W,X) { for (usz i = 0; i < ia; i++) { f64 wv=W; f64 xv=X; rp[i]=EXPR; } } + #define DOF(EXPR,A,W,X) { vfor (usz i = 0; i < ia; i++) { f64 wv=W; f64 xv=X; rp[i]=EXPR; } } #define DOI8(EXPR,A,W,X,BASE) { Ri8(A) for (usz i=0; i<ia; i++) { i16 wv=W; i16 xv=X; i16 rv=EXPR; if (RARE(rv!=( i8)rv)) { decG(r); goto BASE; } rp[i]=rv; } goto dec_ret; } #define DOI16(EXPR,A,W,X,BASE) { Ri16(A) for (usz i=0; i<ia; i++) { i32 wv=W; i32 xv=X; i32 rv=EXPR; if (RARE(rv!=(i16)rv)) { decG(r); goto BASE; } rp[i]=rv; } goto dec_ret; } #define DOI32(EXPR,A,W,X,BASE) { Ri32(A) for (usz i=0; i<ia; i++) { i64 wv=W; i64 xv=X; i64 rv=EXPR; if (RARE(rv!=(i32)rv)) { decG(r); goto BASE; } rp[i]=rv; } goto dec_ret; } @@ -151,20 +151,20 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w)) if (xe<el_i32) { x=taga(cpyI32Arr(x)); xe=el_i32; } void* xp=tyany_ptr(x); \ Rf64(x); \ if (we==el_i32) { B w,x /*shadow*/; \ - if (xe==el_i32) { DECOR for (usz i = 0; i < ia; i++) { w.f=((i32*)wp)[i]; x.f=((i32*)xp)[i]; rp[i]=EXPR; } } \ - else { DECOR for (usz i = 0; i < ia; i++) { w.f=((i32*)wp)[i]; x.f=((f64*)xp)[i]; rp[i]=EXPR; } } \ + if (xe==el_i32) { DECOR vfor (usz i = 0; i < ia; i++) { w.f=((i32*)wp)[i]; x.f=((i32*)xp)[i]; rp[i]=EXPR; } } \ + else { DECOR vfor (usz i = 0; i < ia; i++) { w.f=((i32*)wp)[i]; x.f=((f64*)xp)[i]; rp[i]=EXPR; } } \ } else { B w,x /*shadow*/; \ - if (xe==el_i32) { DECOR for (usz i = 0; i < ia; i++) { w.f=((f64*)wp)[i]; x.f=((i32*)xp)[i]; rp[i]=EXPR; } } \ - else { DECOR for (usz i = 0; i < ia; i++) { w.f=((f64*)wp)[i]; x.f=((f64*)xp)[i]; rp[i]=EXPR; } } \ + if (xe==el_i32) { DECOR vfor (usz i = 0; i < ia; i++) { w.f=((f64*)wp)[i]; x.f=((i32*)xp)[i]; rp[i]=EXPR; } } \ + else { DECOR vfor (usz i = 0; i < ia; i++) { w.f=((f64*)wp)[i]; x.f=((f64*)xp)[i]; rp[i]=EXPR; } } \ } \ decG(w); decG(x); return num_squeeze(r); \ } \ } else if (isF64(w)&isArr(x)) { usz ia=IA(x); u8 xe=TI(x,elType); \ - if (elInt(xe)) {INT_SA Rf64(x); x=toI32Any(x); PI32(x) DECOR for (usz i=0; i<ia; i++) {B x/*shadow*/;x.f=xp[i];rp[i]=EXPR;} decG(x); return num_squeeze(r); } \ - if (xe==el_f64){ Rf64(x); PF(x) FLT_SAI DECOR for (usz i=0; i<ia; i++) {B x/*shadow*/;x.f=xp[i];rp[i]=EXPR;} decG(x); return num_squeeze(r); } \ + if (elInt(xe)) {INT_SA Rf64(x); x=toI32Any(x); PI32(x) DECOR vfor (usz i=0; i<ia; i++) {B x/*shadow*/;x.f=xp[i];rp[i]=EXPR;} decG(x); return num_squeeze(r); } \ + if (xe==el_f64){ Rf64(x); PF(x) FLT_SAI DECOR vfor (usz i=0; i<ia; i++) {B x/*shadow*/;x.f=xp[i];rp[i]=EXPR;} decG(x); return num_squeeze(r); } \ } else if (isF64(x)&isArr(w)) { usz ia=IA(w); u8 we=TI(w,elType); ANY_AS \ - if (elInt(we)) {INT_AS Rf64(w); w=toI32Any(w); PI32(w) DECOR for (usz i=0; i<ia; i++) {B w/*shadow*/;w.f=wp[i];rp[i]=EXPR;} decG(w); return num_squeeze(r); } \ - if (we==el_f64){ Rf64(w); PF(w) DECOR for (usz i=0; i<ia; i++) {B w/*shadow*/;w.f=wp[i];rp[i]=EXPR;} decG(w); return num_squeeze(r); } \ + if (elInt(we)) {INT_AS Rf64(w); w=toI32Any(w); PI32(w) DECOR vfor (usz i=0; i<ia; i++) {B w/*shadow*/;w.f=wp[i];rp[i]=EXPR;} decG(w); return num_squeeze(r); } \ + if (we==el_f64){ Rf64(w); PF(w) DECOR vfor (usz i=0; i<ia; i++) {B w/*shadow*/;w.f=wp[i];rp[i]=EXPR;} decG(w); return num_squeeze(r); } \ } \ P2(NAME) \ } \ @@ -214,7 +214,7 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w)) } , /*INT_AS*/ if (q_i32(x)) return modint_AS(w, x); , /*INT_AA*/ return modint_AA(w, x); - , /*FLT_SAI*/ if (o2fG(w)==1) { for (usz i=0; i<ia; i++) rp[i] = xp[i]-floor(xp[i]); } else + , /*FLT_SAI*/ if (o2fG(w)==1) { vfor (usz i=0; i<ia; i++) rp[i] = xp[i]-floor(xp[i]); } else , /*ANY_AS*/ ) #undef GC2f @@ -271,13 +271,13 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w)) static NOINLINE B bitAA1(B w, B x, usz ia) { u64* rp; B r = m_bitarrc(&rp, x); u64* wp=bitarr_ptr(w); u64* xp=bitarr_ptr(x); - for (usz i=0; i<BIT_N(ia); i++) rp[i] = wp[i]|xp[i]; + vfor (usz i=0; i<BIT_N(ia); i++) rp[i] = wp[i]|xp[i]; decG(w); decG(x); return r; } static NOINLINE B bitAA2(B w, B x, usz ia) { u64* rp; B r = m_bitarrc(&rp, x); u64* wp=bitarr_ptr(w); u64* xp=bitarr_ptr(x); - for (usz i=0; i<BIT_N(ia); i++) rp[i] = wp[i]&xp[i]; + vfor (usz i=0; i<BIT_N(ia); i++) rp[i] = wp[i]&xp[i]; decG(w); decG(x); return r; } @@ -383,7 +383,7 @@ static B modint_AS(B w, B xv) { return modint_AA(w, C2(shape, C1(fne, incG(w)) if (we!=el_c32) w = taga(cpyC32Arr(w)); u32* wp = c32any_ptr(w); usz wia = IA(w); i32* rp; r = m_i32arrc(&rp, w); - for (usz i = 0; i < wia; i++) rp[i] = (i32)wp[i] - xv; + vfor (usz i = 0; i < wia; i++) rp[i] = (i32)wp[i] - xv; goto dec_ret; } }) diff --git a/src/builtins/arithm.c b/src/builtins/arithm.c index fdce468e..d2e9e162 100644 --- a/src/builtins/arithm.c +++ b/src/builtins/arithm.c @@ -11,7 +11,7 @@ static inline B arith_recm(FC1 f, B x) { } void bit_negatePtr(u64* rp, u64* xp, usz count) { - for (usz i = 0; i < count; i++) rp[i] = ~xp[i]; + vfor (usz i = 0; i < count; i++) rp[i] = ~xp[i]; } B bit_negate(B x) { // consumes u64* xp = bitarr_ptr(x); @@ -48,10 +48,10 @@ B add_c1(B t, B x) { i64 ia = IA(x); INIT; \ void* xp = tyany_ptr(x); \ switch(xe) { default: UD; \ - case el_i8: for(usz i=0; i<ia; i++) { i8 c = ((i8* )xp)[i]; EXPR(i8, c==I8_MIN) } break; \ - case el_i16: for(usz i=0; i<ia; i++) { i16 c = ((i16*)xp)[i]; EXPR(i16, c==I16_MIN) } break; \ - case el_i32: for(usz i=0; i<ia; i++) { i32 c = ((i32*)xp)[i]; EXPR(i32, c==I32_MIN) } break; \ - case el_f64: for(usz i=0; i<ia; i++) { f64 c = ((f64*)xp)[i]; EXPR(f64, 0) } break; \ + case el_i8: for(usz i=0; i<ia; i++) { i8 c = ((i8* )xp)[i]; EXPR(i8, c==I8_MIN) } break; \ + case el_i16: for(usz i=0; i<ia; i++) { i16 c = ((i16*)xp)[i]; EXPR(i16, c==I16_MIN) } break; \ + case el_i32: for(usz i=0; i<ia; i++) { i32 c = ((i32*)xp)[i]; EXPR(i32, c==I32_MIN) } break; \ + case el_f64: vfor(usz i=0; i<ia; i++) { f64 c = ((f64*)xp)[i]; EXPR(f64, 0) } break; \ } \ decG(x); return r; POST \ } @@ -75,11 +75,11 @@ B add_c1(B t, B x) { #define STILE_BODY(FEXPR) LOOP_BODY(B r; void* rp = m_tyarrlc(&r, elWidth(xe), x, el2t(xe));, STILE_EXPR, bad: tyarr_freeF(v(r));) #endif -#define FLOAT_BODY(FEXPR) { i64 ia = IA(x); \ - assert(xe==el_f64); f64* xp = f64any_ptr(x); \ - f64* rp; B r = m_f64arrc(&rp, x); \ - for (usz i = 0; i < ia; i++) { f64 v=xp[i]; rp[i]=FEXPR; } \ - decG(x); return num_squeeze(r); \ +#define FLOAT_BODY(FEXPR) { i64 ia = IA(x); \ + assert(xe==el_f64); f64* xp = f64any_ptr(x); \ + f64* rp; B r = m_f64arrc(&rp, x); \ + vfor (usz i = 0; i < ia; i++) { f64 v=xp[i]; rp[i]=FEXPR; } \ + decG(x); return num_squeeze(r); \ } B sub_c2(B,B,B); #define SUB_BODY(FEXPR) return sub_c2(t, m_f64(0), x); @@ -101,7 +101,7 @@ GC1i("¬", not, 1-v, el_bit, bit_negate(x), NOT_BODY) u64 ia = IA(x); \ f64* xp = f64any_ptr(x); \ f64* rp; B r = m_f64arrc(&rp, x); \ - for (i64 i = 0; i < ia; i++) { \ + vfor (i64 i = 0; i < ia; i++) { \ f64 xv=xp[i]; rp[i] = (F); \ } \ decG(x); return r; \ diff --git a/src/builtins/cmp.c b/src/builtins/cmp.c index 3fa9c370..bd8aa0dd 100644 --- a/src/builtins/cmp.c +++ b/src/builtins/cmp.c @@ -69,7 +69,7 @@ CMP_REC(ne, ne, swapped=0;) #define CMP_AA0(N, T, BODY) void base_##N##AA##_##T(u64* r, void* w, void* x, u64 l) { BODY } #define CMP_AA1(N, T, OP) CMP_AA0(N, T, BASE_CMP_LOOP(OP, ((T*)w)[i], ((T*)x)[i])) #define CMP_AA_F(N, OP, BX) \ - CMP_AA0(N, u1, ({usz bia = BIT_N(l); for (usz i=0; i<bia; i++) { u64 wv=((u64*)w)[i], xv=((u64*)x)[i]; ((u64*)r)[i] = BX; }});) \ + CMP_AA0(N, u1, ({usz bia = BIT_N(l); vfor (usz i=0; i<bia; i++) { u64 wv=((u64*)w)[i], xv=((u64*)x)[i]; ((u64*)r)[i] = BX; }});) \ CMP_AA1(N, i8, OP) CMP_AA1(N, i16, OP) CMP_AA1(N, i32, OP) CMP_AA1(N, f64, OP) \ const CmpAAFn base_##N##AA##_u32 = base_##N##AA##_i32; @@ -104,7 +104,7 @@ CMP_REC(ne, ne, swapped=0;) #define CMP_SA1(N, T, Q, C, SLOW, OP) CMP_SA0(N, T, Q, SLOW, ({ T xv = C(x); BASE_CMP_LOOP(OP, ((T*)w)[i], xv) })) #define CMP_SA_F(N, OP, SLOW, BX) \ - CMP_SA0(N, u1, bit, SLOW, ({usz bia = BIT_N(l); u64 xv=bitx(x); for (usz i=0; i<bia; i++) { u64 wv=((u64*)w)[i]; ((u64*)r)[i] = BX; }})) \ + CMP_SA0(N, u1, bit, SLOW, ({usz bia = BIT_N(l); u64 xv=bitx(x); vfor (usz i=0; i<bia; i++) { u64 wv=((u64*)w)[i]; ((u64*)r)[i] = BX; }})) \ CMP_SA1(N,i8,i8,o2iG,SLOW,OP) CMP_SA1(N,i16,i16,o2iG,SLOW,OP) CMP_SA1(N,i32,i32,o2iG,SLOW,OP) CMP_SA1(N,f64,f64,o2fG,SLOW,OP) \ CMP_SA1(N,u8,c8,o2cG,SLOW,OP) CMP_SA1(N,u16,c16,o2cG,SLOW,OP) CMP_SA1(N,u32,c32,o2cG,SLOW,OP) diff --git a/src/builtins/grade.h b/src/builtins/grade.h index 80d3555d..7890d80d 100644 --- a/src/builtins/grade.h +++ b/src/builtins/grade.h @@ -319,12 +319,12 @@ B GRADE_CAT(c1)(B t, B x) { } TALLOC(I32I32p, tmp, ia); - for (usz i = 0; i < ia; i++) { + vfor (usz i = 0; i < ia; i++) { tmp[i].v = i; tmp[i].k = xp[i]; } CAT(GRADE_CAT(IP),tim_sort)(tmp, ia); - for (usz i = 0; i < ia; i++) rp[i] = tmp[i].v; + vfor (usz i = 0; i < ia; i++) rp[i] = tmp[i].v; TFREE(tmp); goto decG_sq; } diff --git a/src/builtins/search.c b/src/builtins/search.c index 8ce7d8d1..49221225 100644 --- a/src/builtins/search.c +++ b/src/builtins/search.c @@ -574,7 +574,7 @@ B asNormalized(B x, usz n, bool nanBad) { } #endif } else { - for (; i < n; i++) rp[i] = normalizeFloat(fp[i]); + vfor (; i < n; i++) rp[i] = normalizeFloat(fp[i]); } if (r.u!=x.u) decG(x); diff --git a/src/builtins/sfns.c b/src/builtins/sfns.c index 1fa22c73..2d6e7e07 100644 --- a/src/builtins/sfns.c +++ b/src/builtins/sfns.c @@ -1066,18 +1066,18 @@ B reverse_c1(B t, B x) { case 0: { u64* rp; r = m_bitarrc(&rp, x); u64* xp=xv; usz g = BIT_N(n); usz e = g-1; - for (usz i = 0; i < g; i++) rp[i] = bit_reverse(xp[e-i]); + vfor (usz i = 0; i < g; i++) rp[i] = bit_reverse(xp[e-i]); if (n&63) { u64 sh=(-n)&63; - for (usz i=0; i<e; i++) rp[i]=rp[i]>>sh|rp[i+1]<<(64-sh); + vfor (usz i=0; i<e; i++) rp[i]=rp[i]>>sh|rp[i+1]<<(64-sh); rp[e]>>=sh; } break; } - case 3: { u8* xp=xv; u8* rp = m_tyarrc(&r, 1, x, xt); for (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; } - case 4: { u16* xp=xv; u16* rp = m_tyarrc(&r, 2, x, xt); for (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; } - case 5: { u32* xp=xv; u32* rp = m_tyarrc(&r, 4, x, xt); for (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; } - case 6: if (TI(x,elType)!=el_B) { u64* xp=xv; u64* rp = m_tyarrc(&r, 8, x, xt); for (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; } + case 3: { u8* xp=xv; u8* rp = m_tyarrc(&r, 1, x, xt); vfor (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; } + case 4: { u16* xp=xv; u16* rp = m_tyarrc(&r, 2, x, xt); vfor (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; } + case 5: { u32* xp=xv; u32* rp = m_tyarrc(&r, 4, x, xt); vfor (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; } + case 6: if (TI(x,elType)!=el_B) { u64* xp=xv; u64* rp = m_tyarrc(&r, 8, x, xt); vfor (ux i=0; i<n; i++) rp[i]=xp[n-i-1]; break; } else { HArr_p rp = m_harrUc(x); B* xp = arr_bptr(x); diff --git a/src/builtins/slash.c b/src/builtins/slash.c index 310b6f51..b2555a37 100644 --- a/src/builtins/slash.c +++ b/src/builtins/slash.c @@ -299,7 +299,7 @@ static B where(B x, usz xia, u64 s) { usz bs; if (b>xia-i) { b=xia-i; bs=s-(rp-rp0); } else { bs=bit_sum(xp,b); } where_block_u16(xp, buf, b, bs); - for (usz j=0; j<bs; j++) rp[j] = i+buf[j]; + vfor (usz j=0; j<bs; j++) rp[j] = i+buf[j]; rp+= bs; xp+= b/64; } @@ -380,7 +380,7 @@ B grade_bool(B x, usz xia, bool up) { u64 xp0[4]; // 4 ≡ b/64 u64* xp1 = xp; for (usz i=0; i<xia; i+=b) { - for (usz j=0; j<BIT_N(b); j++) xp0[j] = ~xp1[j]; + vfor (usz j=0; j<BIT_N(b); j++) xp0[j] = ~xp1[j]; usz b2 = b>xia-i? xia-i : b; if (b2<b) { u64 q=b2%64; usz e=b2/64; u64 m=((u64)1<<q)-1; xp0[e]&=m; xp1[e]&=m; } usz s0=bit_sum(xp0,b2); si_1slash32(xp0, i, rp0, b2, s0); rp0+=s0; @@ -888,7 +888,7 @@ B slash_im(B t, B x) { for (usz i=0; i<xia; i++) t[(u##N)xp[i]]++; \ t[m/2]=xia; usz ria=0; for (u64 s=0; s<xia; ria++) s+=t[ria]; \ if (ria>m/2) thrM("/⁼: Argument cannot contain negative numbers"); \ - i32* rp; r = m_i32arrv(&rp, ria); for (usz i=0; i<ria; i++) rp[i]=t[i]; \ + i32* rp; r = m_i32arrv(&rp, ria); vfor (usz i=0; i<ria; i++) rp[i]=t[i]; \ TFREE(t); \ r = num_squeeze(r); \ } \ @@ -901,7 +901,7 @@ B slash_im(B t, B x) { i8 max = avx2_count_i8(t, (i8*)xp, xia, 0); \ if (max < 0) thrM("/⁼: Argument cannot contain negative numbers"); \ usz ria=max+1; \ - i32* rp; r = m_i32arrv(&rp, ria); for (usz i=0; i<ria; i++) rp[i]=t[i]; \ + i32* rp; r = m_i32arrv(&rp, ria); vfor (usz i=0; i<ria; i++) rp[i]=t[i]; \ TFREE(t); \ r = num_squeeze(r); \ } else diff --git a/src/builtins/sort.c b/src/builtins/sort.c index e803ba21..8492aac7 100644 --- a/src/builtins/sort.c +++ b/src/builtins/sort.c @@ -21,7 +21,7 @@ static NOINLINE void generic_grade(B x, usz ia, B r, i32* rp, void (*fn)(BI32p*, tmp[i].k = GetU(x,i); } fn(tmp, ia); - for (usz i = 0; i < ia; i++) rp[i] = tmp[i].v; + vfor (usz i = 0; i < ia; i++) rp[i] = tmp[i].v; TFREE(tmp); } diff --git a/src/builtins/sysfn.c b/src/builtins/sysfn.c index 81e9675e..d585773a 100644 --- a/src/builtins/sysfn.c +++ b/src/builtins/sysfn.c @@ -1566,7 +1566,7 @@ B bitop1(B f, B x, enum BitOp1 op, char* name) { case op_neg: switch (ow) { default: thrF("•bit._%U: unhandled width %s", name, ow); #define CASE(W) case W: \ - NOUNROLL for (usz i=0; i<n/W; i++) ((u##W*)rp)[i] = -((u##W*)xp)[i]; \ + NOUNROLL vfor (usz i=0; i<n/W; i++) ((u##W*)rp)[i] = -((u##W*)xp)[i]; \ break; CASE(8) CASE(16) CASE(32) CASE(64) #undef CASE @@ -1643,11 +1643,11 @@ B bitop2(B f, B w, B x, enum BitOp2 op, char* name) { } if (noextend) { #define BINOP(O,P) case op_##O: { \ - usz l = n/64; NOUNROLL for (usz i=0; i<l; i++) rp[i] = wp[i] P xp[i]; \ + usz l = n/64; NOUNROLL vfor (usz i=0; i<l; i++) rp[i] = wp[i] P xp[i]; \ usz q = (-n)%64; if (q) rp[l] ^= (~(u64)0 >> q) & (rp[l]^(wp[l] P xp[l])); \ } break; #define CASE(W, Q, P) case W: \ - NOUNROLL for (usz i=0; i<n/W; i++) \ + NOUNROLL vfor (usz i=0; i<n/W; i++) \ ((Q##W*)rp)[i] = ((Q##W*)wp)[i] P ((Q##W*)xp)[i]; \ break; SWITCH @@ -1659,12 +1659,12 @@ B bitop2(B f, B w, B x, enum BitOp2 op, char* name) { if (ow>64) thrF("•bit._%U: scalar extension with width over 64 unhandled", name); \ u64 wv = *wp & (~(u64)0>>(64-ow)); \ for (usz tw=ow; tw<64; tw*=2) wv|=wv<<tw; \ - usz l = n/64; NOUNROLL for (usz i=0; i<l; i++) rp[i] = wv P xp[i]; \ + usz l = n/64; NOUNROLL vfor (usz i=0; i<l; i++) rp[i] = wv P xp[i]; \ usz q = (-n)%64; if (q) rp[l] ^= (~(u64)0 >> q) & (rp[l]^(wv P xp[l])); \ } break; #define CASE(W, Q, P) case W: { \ Q##W wv = *(Q##W*)wp; \ - NOUNROLL for (usz i=0; i<n/W; i++) \ + NOUNROLL vfor (usz i=0; i<n/W; i++) \ ((Q##W*)rp)[i] = wv P ((Q##W*)xp)[i]; \ } break; SWITCH diff --git a/src/core/harr.c b/src/core/harr.c index 9db881da..1b00a10a 100644 --- a/src/core/harr.c +++ b/src/core/harr.c @@ -4,7 +4,7 @@ NOINLINE B m_caB(usz ia, B* a) { HArr_p r = m_harrUv(ia); - for (usz i = 0; i < ia; i++) r.a[i] = a[i]; + vfor (usz i = 0; i < ia; i++) r.a[i] = a[i]; NOGC_E; return r.b; } @@ -78,7 +78,7 @@ BQN_EXP size_t bqn_rank(BQNV a) { return RNK(getB(a)); } BQN_EXP void bqn_shape(BQNV a, size_t* buf) { B b = getB(a); ur r = RNK(b); usz* sh = SH(b); - for (usz i = 0; i < r; i++) buf[i] = sh[i]; + vfor (usz i = 0; i < r; i++) buf[i] = sh[i]; } BQN_EXP BQNV bqn_pick(BQNV a, size_t pos) { return makeX(IGet(getB(a),pos)); @@ -590,7 +590,7 @@ FORCE_INLINE u64 i64abs(i64 x) { return x<0?-x:x; } usz ia = IA(x); \ B t = WIDEN(x); WEL* tp = WEL##any_ptr(t); \ REL* rp; B r = m_##REL##arrv(&rp, ia); \ - for (usz i=0; i<ia; i++) ((UEL*)rp)[i] = tp[i]; \ + vfor (usz i=0; i<ia; i++) ((UEL*)rp)[i] = tp[i];\ decG(t); return r; // copy elements of x to array of unsigned integers (using a signed integer array type as a "container"); consumes argument @@ -603,7 +603,7 @@ NOINLINE B cpyF32Bits(B x) { // copy x to a 32-bit float array (using an i32arr usz ia = IA(x); B t = toF64Any(x); f64* tp = f64any_ptr(t); i32* rp; B r = m_i32arrv(&rp, ia); - for (usz i=0; i<ia; i++) ((f32*)rp)[i]=tp[i]; + vfor (usz i=0; i<ia; i++) ((f32*)rp)[i]=tp[i]; dec(t); return r; } @@ -613,10 +613,10 @@ static B toU16Bits(B x) { return TI(x,elType)==el_i16? x : cpyU16Bits(x); } static B toU8Bits(B x) { return TI(x,elType)==el_i8? x : cpyU8Bits(x); } // read x as the specified type (assuming a container of the respective width signed integer array); consumes x -NOINLINE B readU8Bits(B x) { usz ia=IA(x); u8* xp=tyarr_ptr(x); i16* rp; B r=m_i16arrv(&rp, ia); for (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); } -NOINLINE B readU16Bits(B x) { usz ia=IA(x); u16* xp=tyarr_ptr(x); i32* rp; B r=m_i32arrv(&rp, ia); for (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); } -NOINLINE B readU32Bits(B x) { usz ia=IA(x); u32* xp=tyarr_ptr(x); f64* rp; B r=m_f64arrv(&rp, ia); for (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); } -NOINLINE B readF32Bits(B x) { usz ia=IA(x); f32* xp=tyarr_ptr(x); f64* rp; B r=m_f64arrv(&rp, ia); for (usz i=0; i<ia; i++) rp[i]=xp[i]; return r; } +NOINLINE B readU8Bits(B x) { usz ia=IA(x); u8* xp=tyarr_ptr(x); i16* rp; B r=m_i16arrv(&rp, ia); vfor (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); } +NOINLINE B readU16Bits(B x) { usz ia=IA(x); u16* xp=tyarr_ptr(x); i32* rp; B r=m_i32arrv(&rp, ia); vfor (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); } +NOINLINE B readU32Bits(B x) { usz ia=IA(x); u32* xp=tyarr_ptr(x); f64* rp; B r=m_f64arrv(&rp, ia); vfor (usz i=0; i<ia; i++) rp[i]=xp[i]; return num_squeeze(r); } +NOINLINE B readF32Bits(B x) { usz ia=IA(x); f32* xp=tyarr_ptr(x); f64* rp; B r=m_f64arrv(&rp, ia); vfor (usz i=0; i<ia; i++) rp[i]=xp[i]; return r; } B m_ptrobj_s(void* ptr, B o); // consumes o, sets stride to size of o B m_ptrobj(void* ptr, B o, ux stride); // consumes o static NOINLINE B ptrobj_checkget(B x); // doesn't consume @@ -123,13 +123,16 @@ typedef size_t ux; #if __clang__ #define NOUNROLL _Pragma("clang loop unroll(disable)") #define NOVECTORIZE _Pragma("clang loop vectorize(disable)") + #define vfor _Pragma("clang loop vectorize(assume_safety)") for #elif __GNUC__ #define EXACTLY_GCC 1 #define NOUNROLL _Pragma("GCC unroll 1") + #define vfor _Pragma("GCC ivdep") for #define NOVECTORIZE #else #define NOUNROLL #define NOVECTORIZE + #define vfor for #endif #define PLAINLOOP NOUNROLL NOVECTORIZE #if EXACTLY_GCC diff --git a/src/utils/mut.c b/src/utils/mut.c index 4a538a08..1deea741 100644 --- a/src/utils/mut.c +++ b/src/utils/mut.c @@ -289,12 +289,12 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x B* mpo = ms+(B*)a; switch(TY(x)) { case t_bitarr: { u64* xp = bitarr_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_i32(bitp_get(xp, xs+i)); return; } - case t_i8arr: case t_i8slice: { i8* xp = i8any_ptr (x); for (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; } - case t_i16arr: case t_i16slice: { i16* xp = i16any_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; } - case t_i32arr: case t_i32slice: { i32* xp = i32any_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; } - case t_c8arr: case t_c8slice: { u8* xp = c8any_ptr (x); for (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; } - case t_c16arr: case t_c16slice: { u16* xp = c16any_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; } - case t_c32arr: case t_c32slice: { u32* xp = c32any_ptr(x); for (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; } + case t_i8arr: case t_i8slice: { i8* xp = i8any_ptr (x); vfor (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; } + case t_i16arr: case t_i16slice: { i16* xp = i16any_ptr(x); vfor (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; } + case t_i32arr: case t_i32slice: { i32* xp = i32any_ptr(x); vfor (usz i = 0; i < l; i++) mpo[i] = m_i32(xp[i+xs]); return; } + case t_c8arr: case t_c8slice: { u8* xp = c8any_ptr (x); vfor (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; } + case t_c16arr: case t_c16slice: { u16* xp = c16any_ptr(x); vfor (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; } + case t_c32arr: case t_c32slice: { u32* xp = c32any_ptr(x); vfor (usz i = 0; i < l; i++) mpo[i] = m_c32(xp[i+xs]); return; } case t_harr: case t_hslice: case t_fillarr: case t_fillslice:; B* xp = arr_bptr(x)+xs; for (usz i = 0; i < l; i++) inc(xp[i]); @@ -393,15 +393,15 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x E* rp; Arr* r = m_##E##arrp(&rp, ia); \ arr_shCopy(r, x); \ u8 xe = TI(x,elType); \ - if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<ia; i++) rp[i]=bitp_get(xp,i); } \ - else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ - else if (xe==el_i16) { i16* xp = i16any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ - else if (xe==el_i32) { i32* xp = i32any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ - else if (xe==el_f64) { f64* xp = f64any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ + if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<ia; i++) rp[i]=bitp_get(xp,i); } \ + else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ + else if (xe==el_i16) { i16* xp = i16any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ + else if (xe==el_i32) { i32* xp = i32any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ + else if (xe==el_f64) { f64* xp = f64any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ else { \ B* xp = arr_bptr(x); \ - if (xp!=NULL) { for (usz i=0; i<ia; i++) rp[i]=o2fG(xp[i] ); } \ - else { SGetU(x) for (usz i=0; i<ia; i++) rp[i]=o2fG(GetU(x,i)); } \ + if (xp!=NULL) { vfor (usz i=0; i<ia; i++) rp[i]=o2fG(xp[i] ); } \ + else { SGetU(x) for (usz i=0; i<ia; i++) rp[i]=o2fG(GetU(x,i)); } \ } \ ptr_decT(a(x)); \ return r; \ @@ -413,13 +413,13 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x T##Atom* rp; Arr* r = m_##E##arrp(&rp, ia); \ arr_shCopy(r, x); \ u8 xe = TI(x,elType); \ - if (xe==el_c8 ) { u8* xp = c8any_ptr (x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ - else if (xe==el_c16) { u16* xp = c16any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ - else if (xe==el_c32) { u32* xp = c32any_ptr(x); for(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ + if (xe==el_c8 ) { u8* xp = c8any_ptr (x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ + else if (xe==el_c16) { u16* xp = c16any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ + else if (xe==el_c32) { u32* xp = c32any_ptr(x); vfor(usz i=0; i<ia; i++) rp[i]=xp[i]; } \ else { \ B* xp = arr_bptr(x); \ - if (xp!=NULL) { for (usz i=0; i<ia; i++) rp[i]=o2cG(xp[i] ); } \ - else { SGetU(x) for (usz i=0; i<ia; i++) rp[i]=o2cG(GetU(x,i)); } \ + if (xp!=NULL) { vfor (usz i=0; i<ia; i++) rp[i]=o2cG(xp[i] ); } \ + else { SGetU(x) for (usz i=0; i<ia; i++) rp[i]=o2cG(GetU(x,i)); } \ } \ ptr_decT(a(x)); \ return r; \ @@ -429,14 +429,14 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x usz ia = IA(x); HArr_p r = m_harrUc(x); u8 xe = TI(x,elType); - if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(bitp_get(xp, i)); } - else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); } - else if (xe==el_i16) { i16* xp = i16any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); } - else if (xe==el_i32) { i32* xp = i32any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); } - else if (xe==el_f64) { f64* xp = f64any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); } - else if (xe==el_c8 ) { u8* xp = c8any_ptr (x); for(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); } - else if (xe==el_c16) { u16* xp = c16any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); } - else if (xe==el_c32) { u32* xp = c32any_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); } + if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<ia; i++) r.a[i]=m_f64(bitp_get(xp, i)); } + else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); vfor(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); } + else if (xe==el_i16) { i16* xp = i16any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); } + else if (xe==el_i32) { i32* xp = i32any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); } + else if (xe==el_f64) { f64* xp = f64any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_f64(xp[i]); } + else if (xe==el_c8 ) { u8* xp = c8any_ptr (x); vfor(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); } + else if (xe==el_c16) { u16* xp = c16any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); } + else if (xe==el_c32) { u32* xp = c32any_ptr(x); vfor(usz i=0; i<ia; i++) r.a[i]=m_c32(xp[i]); } else { B* xp = arr_bptr(x); if (xp!=NULL) { for (usz i=0; i<ia; i++) r.a[i] = inc(xp[i]); } @@ -451,7 +451,7 @@ DEF_G(void, copy, B, (void* a, usz ms, B x, usz xs, usz l), ms, x, x u64* rp; Arr* r = m_bitarrp(&rp, ia); arr_shCopy(r, x); u8 xe = TI(x,elType); - if (xe==el_bit) { u64* xp = bitarr_ptr(x); for(usz i=0; i<BIT_N(ia); i++) rp[i] = xp[i]; } + if (xe==el_bit) { u64* xp = bitarr_ptr(x); vfor(usz i=0; i<BIT_N(ia); i++) rp[i] = xp[i]; } else if (xe==el_i8 ) { i8* xp = i8any_ptr (x); for(usz i=0; i<ia; i++) bitp_set(rp,i,xp[i]); } else if (xe==el_i16) { i16* xp = i16any_ptr(x); for(usz i=0; i<ia; i++) bitp_set(rp,i,xp[i]); } else if (xe==el_i32) { i32* xp = i32any_ptr(x); for(usz i=0; i<ia; i++) bitp_set(rp,i,xp[i]); } |