MODULE WMRasterScale;	(** AUTHOR "TF"; PURPOSE "Support scaling of images"; *)
(** AUTHOR "MZ"; PURPOSE "Speedup rasterops with SSE2"; *)

IMPORT
	SYSTEM, Raster, Rect := WMRectangles;

CONST
	(** Copy Modes *)
	ModeCopy* = 0; ModeSrcOverDst* = 1;

	(** Scale Modes *)
	ScaleBox* = 0; ScaleBilinear* = 1;

TYPE
	Rectangle = Rect.Rectangle;
	Image = Raster.Image;
	(* scaler working on Raster images through Raster.Get / Raster.Put *)
	ScalerProc = PROCEDURE (src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
	(* scaler working directly on pixel memory; srcbpr / dstbpr are the bytes per row of source and destination *)
	XScalerProc = PROCEDURE (srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);

(* copy sr in 16.16 fix rectangle from src to dr integer rectangle in dst *)
(* Point-sampled copy for arbitrary pixel formats.
	For every destination pixel (x, y) with dr.l <= x < dr.r and dr.t <= y < dr.b the source pixel at
	(fx DIV 65536, fy DIV 65536) is read and written with Raster.srcCopy.
	sx, sy : 16.16 fixed point source position of the first destination pixel
	sdx, sdy : 16.16 fixed point source step per destination pixel / destination row *)
PROCEDURE Q0GenericCopy(src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
VAR x, y : LONGINT; col : Raster.Pixel;
	getMode, putMode : Raster.Mode;
	fx, fy : LONGINT;
BEGIN
	Raster.InitMode(getMode, Raster.srcCopy);
	Raster.InitMode(putMode, Raster.srcCopy);
	fy := sy;
	FOR y := dr.t TO dr.b - 1 DO
		fx := sx;
		FOR x := dr.l TO dr.r - 1 DO
			Raster.Get(src, fx DIV 65536, fy DIV 65536, col, getMode);
			INC(fx, sdx);
			Raster.Put(dst, x, y, col, putMode)
		END;
		INC(fy, sdy)
	END
END Q0GenericCopy;

(* Same sampling as Q0GenericCopy, but the pixel is written with Raster.srcOverDst instead of Raster.srcCopy. *)
PROCEDURE Q0GenericSrcOverDst(src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
VAR x, y : LONGINT; col : Raster.Pixel;
	getMode, putMode : Raster.Mode;
	fx, fy : LONGINT;
BEGIN
	Raster.InitMode(getMode, Raster.srcCopy);
	Raster.InitMode(putMode, Raster.srcOverDst);
	fy := sy;
	FOR y := dr.t TO dr.b - 1 DO
		fx := sx;
		FOR x := dr.l TO dr.r - 1 DO
			Raster.Get(src, fx DIV 65536, fy DIV 65536, col, getMode);
			INC(fx, sdx);
			Raster.Put(dst, x, y, col, putMode)
		END;
		INC(fy, sdy)
	END
END Q0GenericSrcOverDst;

(* copy sr in 16.16 fix rectangle from src to dr integer rectangle in dst *)
(* Bilinear copy for arbitrary pixel formats.
	The source position is shifted by half a pixel (8000H in 16.16). The four neighbouring source pixels
	are fetched, blended horizontally with the fractional part of fx and then vertically with the
	fractional part of fy; the result is written with Raster.srcCopy.
	Neighbour coordinates are passed through Bounds (defined elsewhere in this module; presumably a clamp
	to [0, width - 1] / [0, height - 1] - confirm there). *)
PROCEDURE Q1GenericCopy(src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; col, col0, col1, col2, col3 : Raster.Pixel;
	b0, g0, r0, a0, b1, g1, r1, a1, cb, cg, cr, ca : LONGINT;
	getMode, putMode : Raster.Mode;
	fx, fy : LONGINT; x0, x1, y0, y1 : LONGINT;
BEGIN
	Raster.InitMode(getMode, Raster.srcCopy);
	Raster.InitMode(putMode, Raster.srcCopy);
	fy := sy - 8000H; sx := sx - 8000H;
	FOR y := dr.t TO dr.b - 1 DO
		fx := sx;
		y0 := Bounds(fy DIV 65536, 0, src.height - 1);
		y1 := Bounds(fy DIV 65536 + 1, 0, src.height - 1);
		(* the vertical weights depend on fy only, so they are computed once per row *)
		yftop := (65536 - fy MOD 65536);
		yfbottom := (fy MOD 65536);
		FOR x := dr.l TO dr.r - 1 DO
			x0 := Bounds(fx DIV 65536, 0, src.width - 1);
			x1 := Bounds(fx DIV 65536 + 1, 0, src.width - 1);
			Raster.Get(src, x0, y0, col0, getMode);
			Raster.Get(src, x1, y0, col1, getMode);
			Raster.Get(src, x0, y1, col2, getMode);
			Raster.Get(src, x1, y1, col3, getMode);

			xfleft := (65536 - fx MOD 65536);
			xfright := (fx MOD 65536);

			(* upper pixel pair, blended horizontally *)
			b0 := (ORD(col0[Raster.b]) * xfleft + ORD(col1[Raster.b]) * xfright) DIV 65536;
			g0 := (ORD(col0[Raster.g]) * xfleft + ORD(col1[Raster.g]) * xfright) DIV 65536;
			r0 := (ORD(col0[Raster.r]) * xfleft + ORD(col1[Raster.r]) * xfright) DIV 65536;
			a0 := (ORD(col0[Raster.a]) * xfleft + ORD(col1[Raster.a]) * xfright) DIV 65536;

			(* lower pixel pair, blended horizontally *)
			b1 := (ORD(col2[Raster.b]) * xfleft + ORD(col3[Raster.b]) * xfright) DIV 65536;
			g1 := (ORD(col2[Raster.g]) * xfleft + ORD(col3[Raster.g]) * xfright) DIV 65536;
			r1 := (ORD(col2[Raster.r]) * xfleft + ORD(col3[Raster.r]) * xfright) DIV 65536;
			a1 := (ORD(col2[Raster.a]) * xfleft + ORD(col3[Raster.a]) * xfright) DIV 65536;

			(* vertical blend of the two intermediate results *)
			cb := (b0 * yftop + b1 * yfbottom) DIV 65536;
			cg := (g0 * yftop + g1 * yfbottom) DIV 65536;
			cr := (r0 * yftop + r1 * yfbottom) DIV 65536;
			ca := (a0 * yftop + a1 * yfbottom) DIV 65536;

			col[Raster.b] := CHR(cb);
			col[Raster.g] := CHR(cg);
			col[Raster.r] := CHR(cr);
			col[Raster.a] := CHR(ca);
			INC(fx, sdx);
			Raster.Put(dst, x, y, col, putMode)
		END;
		INC(fy, sdy)
	END
END Q1GenericCopy;

(* copy sr in 16.16 fix rectangle from src to dr integer rectangle in dst *)
(* Same bilinear sampling as Q1GenericCopy, but the pixel is written with Raster.srcOverDst. *)
PROCEDURE Q1GenericSrcOverDst(src, dst : Image; VAR dr : Rectangle; sx, sy, sdx, sdy : LONGINT);
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; col, col0, col1, col2, col3 : Raster.Pixel;
	b0, g0, r0, a0, b1, g1, r1, a1, cb, cg, cr, ca : LONGINT;
	getMode, putMode : Raster.Mode;
	fx, fy : LONGINT; x0, x1, y0, y1 : LONGINT;
BEGIN
	Raster.InitMode(getMode, Raster.srcCopy);
	Raster.InitMode(putMode, Raster.srcOverDst);
	fy := sy - 8000H; sx := sx - 8000H;
	FOR y := dr.t TO dr.b - 1 DO
		fx := sx;
		y0 := Bounds(fy DIV 65536, 0, src.height - 1);
		y1 := Bounds(fy DIV 65536 + 1, 0, src.height - 1);
		(* the vertical weights depend on fy only, so they are computed once per row *)
		yftop := (65536 - fy MOD 65536);
		yfbottom := (fy MOD 65536);
		FOR x := dr.l TO dr.r - 1 DO
			x0 := Bounds(fx DIV 65536, 0, src.width - 1);
			x1 := Bounds(fx DIV 65536 + 1, 0, src.width - 1);
			Raster.Get(src, x0, y0, col0, getMode);
			Raster.Get(src, x1, y0, col1, getMode);
			Raster.Get(src, x0, y1, col2, getMode);
			Raster.Get(src, x1, y1, col3, getMode);

			xfleft := (65536 - fx MOD 65536);
			xfright := (fx MOD 65536);

			(* upper pixel pair, blended horizontally *)
			b0 := (ORD(col0[Raster.b]) * xfleft + ORD(col1[Raster.b]) * xfright) DIV 65536;
			g0 := (ORD(col0[Raster.g]) * xfleft + ORD(col1[Raster.g]) * xfright) DIV 65536;
			r0 := (ORD(col0[Raster.r]) * xfleft + ORD(col1[Raster.r]) * xfright) DIV 65536;
			a0 := (ORD(col0[Raster.a]) * xfleft + ORD(col1[Raster.a]) * xfright) DIV 65536;

			(* lower pixel pair, blended horizontally *)
			b1 := (ORD(col2[Raster.b]) * xfleft + ORD(col3[Raster.b]) * xfright) DIV 65536;
			g1 := (ORD(col2[Raster.g]) * xfleft + ORD(col3[Raster.g]) * xfright) DIV 65536;
			r1 := (ORD(col2[Raster.r]) * xfleft + ORD(col3[Raster.r]) * xfright) DIV 65536;
			a1 := (ORD(col2[Raster.a]) * xfleft + ORD(col3[Raster.a]) * xfright) DIV 65536;

			(* vertical blend of the two intermediate results *)
			cb := (b0 * yftop + b1 * yfbottom) DIV 65536;
			cg := (g0 * yftop + g1 * yfbottom) DIV 65536;
			cr := (r0 * yftop + r1 * yfbottom) DIV 65536;
			ca := (a0 * yftop + a1 * yfbottom) DIV 65536;

			col[Raster.b] := CHR(cb);
			col[Raster.g] := CHR(cg);
			col[Raster.r] := CHR(cr);
			col[Raster.a] := CHR(ca);
			INC(fx, sdx);
			Raster.Put(dst, x, y, col, putMode)
		END;
		INC(fy, sdy)
	END
END Q1GenericSrcOverDst;

(* Point-sampled copy between two buffers of 16 bit pixels, working directly on memory.
	srcadr, dstadr : address of the first pixel row of source / destination
	srcbpr, dstbpr : bytes per row of source / destination
	dl, dt, dr, db : destination rectangle; pixels with dl <= x < dr and dt <= y < db are written
	sx, sy : 16.16 fixed point source position of the first destination pixel
	sdx, sdy : 16.16 fixed point source step per destination pixel / destination row
	sw, sh : not used by this procedure
	The running addresses are kept in ADDRESS variables (as in the other scalers of this module) so that
	they are not narrowed to LONGINT. *)
PROCEDURE XQ0BGR565BGR565(srcadr,dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y : LONGINT; yadr, adr, sa : ADDRESS; col : LONGINT;
	fx, fy : LONGINT;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 2 + dt * dstbpr;	(* first destination pixel; 2 bytes per pixel *)
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		sa := srcadr + (fy DIV 65536) * srcbpr;	(* start of the sampled source row *)
		FOR x := dl TO dr - 1 DO
			col := SYSTEM.GET16(sa + (fx DIV 65536) * 2);
			INC(fx, sdx);
			SYSTEM.PUT16(adr, col);
			INC(adr, 2)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END XQ0BGR565BGR565;

(*
(* this asm version is 2.3 times faster than the portable version. (P3/600/Dell precision 420 (dual)) *)
PROCEDURE XQ0BGR565BGR565(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR yadr : LONGINT;
CODE {SYSTEM.i386}
	MOV EDX, [EBP+dstadr]
	MOV EBX, [EBP+dl]
	SHL EBX, 1
	ADD EDX, EBX
	MOV EBX, [EBP+dt]
	IMUL EBX, [EBP+dstbpr]
	ADD EDX, EBX ; edx = dstadr + 2 * dl + dt * dstbpr
	MOV [EBP+yadr], EDX
	; init first EDI
	MOV EDI, EDX
	MOV ECX, [EBP+dt]
	SUB [EBP+db], ECX ; counter in db
	MOV EDX, [EBP+sdx] ; keep EDX
	; init first ESI
	MOV ESI, [EBP+srcadr] ; calc new source adr
	MOV EAX, [EBP+sy]
	SHR EAX, 16 ; integer part of sy
	IMUL EAX, [EBP+srcbpr] ; sy * srcbpr
	ADD ESI, EAX ; first source adr in ESI
outerloop:
	MOV EBX, [EBP+sx]
	MOV ECX, [EBP+dr] ; FOR x := dl TO dr - 1 DO
	SUB ECX, [EBP+dl]
innerloop:
	MOV EAX, EBX
	SHR EAX, 16
	MOV AX, WORD [ESI + EAX * 2] ; read the pixel
	ADD EBX, EDX ; INC fx, sdx
	MOV [EDI], AX ; set the pixel
	ADD EDI, 2 ; inc adr
	LOOP innerloop
	; free : EAX, EBX, ECX
	MOV EAX, [EBP+sy] ; sy := sy + sdy
	ADD EAX, [EBP+sdy]
	MOV [EBP+sy], EAX ; keep sy in EAX
	MOV ESI, [EBP+srcadr] ; calc new source adr
	SHR EAX, 16 ; integer part of sy
	IMUL EAX, [EBP+srcbpr] ; sy * srcbpr
	ADD ESI, EAX ; new source adr in ESI
	; new dst address
	MOV ECX, [EBP+dstbpr]
	MOV EAX, [EBP+yadr]
	ADD EAX, ECX
	MOV EDI, EAX
	MOV [EBP+yadr], EAX
	DEC DWORD [EBP+db]
	JNLE outerloop
END XQ0BGR565BGR565;
*)

(*PROCEDURE SSE2Q0BGR565BGR565(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT (*; VAR mysrc, mydest, myres: ARRAY OF LONGINT*));
VAR yadr : LONGINT;
CODE {SYSTEM.i386, SYSTEM.MMX, SYSTEM.SSE, SYSTEM.SSE2} PUSHFD PUSH EBX ; CLI MOV EDX, [EBP+dstadr] MOV EBX, [EBP+dl] SHL EBX, 1 ADD EDX, EBX MOV EBX, [EBP+dt] IMUL EBX, [EBP+dstbpr] ADD EDX, EBX ; edx = dstadr + 2 * dl + dt * dstbpr MOV [EBP+yadr], EDX ; init first EDI MOV EDI, EDX MOV ECX, [EBP+dt] SUB [EBP+db], ECX ; counter in db JLE endyloop MOV EDX, [EBP+sdx] ; keep EDX ; init first ESI MOV ESI, [EBP+srcadr] ; calc new source adr MOV EAX, [EBP+sy] SHR EAX, 16 ; integer part of sy IMUL EAX, [EBP+srcbpr] ; sy * srcbpr ADD ESI, EAX ; first source adr in ESI outerloop: MOV EBX, [EBP+sx] MOV ECX, [EBP+dr] ; FOR x := dl TO dr - 1 DO SUB ECX, [EBP+dl] JLE endyloop innerloop: CMP ECX, 8 JLE singlepixel PXOR XMM0, XMM0 ; 8pixels at the time MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel PINSRW XMM0, EAX,0 ADD EBX, EDX ; INC fx, sdx MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel PINSRW XMM0, EAX,1 ADD EBX, EDX ; INC fx, sdx MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel PINSRW XMM0, EAX,2 ADD EBX, EDX ; INC fx, sdx MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel PINSRW XMM0, EAX,3 ADD EBX, EDX ; INC fx, sdx MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel PINSRW XMM0, EAX,4 ADD EBX, EDX ; INC fx, sdx MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel PINSRW XMM0, EAX,5 ADD EBX, EDX ; INC fx, sdx MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel PINSRW XMM0, EAX,6 ADD EBX, EDX ; INC fx, sdx MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel PINSRW XMM0, EAX,7 ADD EBX, EDX ; INC fx, sdx MOVDQU [EDI], XMM0 ; MOV [EDI], AX ; set the pixels ADD EDI, 16 ; inc adr SUB ECX, 8 CMP ECX, 0 JE outside2 ; LOOP innerloop JMP innerloop singlepixel: MOV EAX, EBX SHR EAX, 16 MOV AX, WORD [ESI + EAX * 2] ; read the pixel ADD EBX, EDX ; INC fx, sdx MOV [EDI], AX ; set the pixel ADD EDI, 2 ; inc adr SUB ECX, 1 CMP 
ECX, 0 JE outside2 ; LOOP innerloop JMP innerloop outside2: ; free : EAX, EBX, ECX MOV EAX, [EBP+sy] ; sy := sy + sdy ADD EAX, [EBP+sdy] MOV [EBP+sy], EAX ; keep sy in EAX MOV ESI, [EBP+srcadr] ; calc new source adr SHR EAX, 16 ; integer part of sy IMUL EAX, [EBP+srcbpr] ; sy * srcbpr ADD ESI, EAX ; new source adr in ESI ; new dst address MOV ECX, [EBP+dstbpr] MOV EAX, [EBP+yadr] ADD EAX, ECX MOV EDI, EAX MOV [EBP+yadr], EAX DEC DWORD [EBP+db] JNLE outerloop endyloop: EMMS ; declare FPU registers free POP EBX POPFD END SSE2Q0BGR565BGR565; *)

(* Bilinear copy between two buffers of 16 bit 5-6-5 pixels, working directly on memory.
	srcadr, dstadr : address of the first pixel row of source / destination
	srcbpr, dstbpr : bytes per row of source / destination
	dl, dt, dr, db : destination rectangle; pixels with dl <= x < dr and dt <= y < db are written
	sx, sy : 16.16 fixed point source position of the first destination pixel (shifted by half a pixel here)
	sdx, sdy : 16.16 fixed point source step per destination pixel / destination row
	sw, sh : source width / height in pixels, used as upper limit (sw - 1, sh - 1) for the neighbour coordinates
	Neighbour coordinates are passed through Bounds (defined elsewhere in this module; presumably a clamp - confirm there).
	Channels are expanded to 0..255 (blue, red * 8; green * 4) before blending and packed back to 5-6-5 afterwards. *)
PROCEDURE Q1BGR565BGR565(srcadr,dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr, adr: ADDRESS; col0, col1, col2, col3 : LONGINT;
	b0, g0, r0, b1, g1, r1, cb, cg, cr : LONGINT;
	fx, fy, xadd1, xadd2 : LONGINT; yadd1, yadd2: ADDRESS;
BEGIN
	yadr := dstadr + dl * 2 + dt * dstbpr;
	fy := sy - 8000H; sx := sx - 8000H;
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		yadd1 := srcadr + Bounds(fy DIV 65536, 0, sh - 1) * srcbpr;
		yadd2 := srcadr + Bounds(fy DIV 65536 + 1, 0, sh - 1) * srcbpr;
		(* the vertical weights depend on fy only, so they are computed once per row *)
		yftop := (65536 - fy MOD 65536);
		yfbottom := (fy MOD 65536);
		FOR x := dl TO dr - 1 DO
			xadd1 := Bounds(fx DIV 65536, 0, sw - 1) * 2;
			xadd2 := Bounds(fx DIV 65536 + 1, 0, sw - 1) * 2;
			col0 := SYSTEM.GET16(yadd1 + xadd1);
			col1 := SYSTEM.GET16(yadd1 + xadd2);
			col2 := SYSTEM.GET16(yadd2 + xadd1);
			col3 := SYSTEM.GET16(yadd2 + xadd2);

			xfleft := (65536 - fx MOD 65536);
			xfright := (fx MOD 65536);

			(* upper pixel pair: blue = bits 0..4, green = bits 5..10, red = bits 11..15 *)
			b0 := ((col0 MOD 32) * 8 * xfleft + (col1 MOD 32) * 8 * xfright) DIV 65536;
			g0 := ((col0 DIV 32 MOD 64) * 4 * xfleft + (col1 DIV 32 MOD 64) * 4 * xfright) DIV 65536;
			r0 := ((col0 DIV 2048 MOD 32) * 8 * xfleft + (col1 DIV 2048 MOD 32) * 8 * xfright) DIV 65536;

			(* lower pixel pair *)
			b1 := ((col2 MOD 32) * 8 * xfleft + (col3 MOD 32) * 8 * xfright) DIV 65536;
			g1 := ((col2 DIV 32 MOD 64) * 4 * xfleft + (col3 DIV 32 MOD 64) * 4 * xfright) DIV 65536;
			r1 := ((col2 DIV 2048 MOD 32) * 8 * xfleft + (col3 DIV 2048 MOD 32) * 8 * xfright) DIV 65536;

			cb := (b0 * yftop + b1 * yfbottom) DIV 65536;
			cg := (g0 * yftop + g1 * yfbottom) DIV 65536;
			cr := (r0 * yftop + r1 * yfbottom) DIV 65536;
			INC(fx, sdx);
			SYSTEM.PUT16(adr, ASH(cb, -3) + ASH(ASH(cg, -2), 5) + ASH(ASH(cr, -3), 11));
			INC(adr, 2)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q1BGR565BGR565;

(* No SSE2 implementation is present in this module for this scaler (the procedure body used to be empty,
	so selecting it silently left the destination untouched). It forwards to the portable Q1BGR565BGR565,
	which has the same signature. *)
PROCEDURE SSE2Q1BGR565BGR565(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
BEGIN
	Q1BGR565BGR565(srcadr, dstadr, srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh)
END SSE2Q1BGR565BGR565;

(* Bilinear scaling of a 32 bit source (blue in bits 0..7, green 8..15, red 16..23, alpha 24..31) onto a
	16 bit 5-6-5 destination with alpha blending, working directly on memory.
	Parameters as for Q1BGR565BGR565; the source has 4 bytes per pixel, the destination 2.
	Per pixel: the interpolated alpha ca decides: 0 -> destination untouched; 255 -> interpolated colour
	is written; otherwise colour + (256 - ca) * destination DIV 256 is written.
	The blended channels are limited to 255: a value of 256 does not fit the 5/6 bit fields and would
	carry into the neighbouring channel (or, for red, be cut off by the 16 bit store). *)
PROCEDURE Q1BGRA8888BGR565(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr: ADDRESS;
	col, col0, col1, col2, col3 : LONGINT;
	b0, g0, r0, a0, b1, g1, r1, a1, cb, cg, cr, ca, dstb, dstg, dstr : LONGINT;
	fx, fy, xadd0, xadd1: LONGINT; yadd0, yadd1: ADDRESS;
BEGIN
	yadr := dstadr + dl * 2 + dt * dstbpr;
	fy := sy - 8000H; sx := sx - 8000H;
	FOR y := dt TO db - 1 DO
		fx := sx;
		dstadr := yadr;	(* the value parameter dstadr is reused as running destination address *)
		yadd0 := srcadr + Bounds(fy DIV 65536, 0, sh - 1) * srcbpr;
		yadd1 := srcadr + Bounds(fy DIV 65536 + 1, 0, sh - 1) * srcbpr;
		(* the vertical weights depend on fy only, so they are computed once per row *)
		yftop := (65536 - fy MOD 65536);
		yfbottom := (fy MOD 65536);
		FOR x := dl TO dr - 1 DO
			xadd0 := Bounds(fx DIV 65536, 0, sw - 1) * 4;
			xadd1 := Bounds(fx DIV 65536 + 1, 0, sw - 1) * 4;
			col0 := SYSTEM.GET32(yadd0 + xadd0);
			col1 := SYSTEM.GET32(yadd0 + xadd1);
			col2 := SYSTEM.GET32(yadd1 + xadd0);
			col3 := SYSTEM.GET32(yadd1 + xadd1);

			xfleft := (65536 - fx MOD 65536);
			xfright := (fx MOD 65536);

			(* alpha first: the colour channels are only needed if the pixel is visible *)
			a0 := ((col0 DIV 1000000H MOD 100H) * xfleft + (col1 DIV 1000000H MOD 100H) * xfright) DIV 65536;
			a1 := ((col2 DIV 1000000H MOD 100H) * xfleft + (col3 DIV 1000000H MOD 100H) * xfright) DIV 65536;
			ca := (a0 * yftop + a1 * yfbottom) DIV 65536;

			IF ca # 0 THEN
				b0 := ((col0 MOD 100H) * xfleft + (col1 MOD 100H) * xfright) DIV 65536;
				g0 := ((col0 DIV 100H MOD 100H) * xfleft + (col1 DIV 100H MOD 100H) * xfright) DIV 65536;
				r0 := ((col0 DIV 10000H MOD 100H) * xfleft + (col1 DIV 10000H MOD 100H) * xfright) DIV 65536;

				b1 := ((col2 MOD 100H) * xfleft + (col3 MOD 100H) * xfright) DIV 65536;
				g1 := ((col2 DIV 100H MOD 100H) * xfleft + (col3 DIV 100H MOD 100H) * xfright) DIV 65536;
				r1 := ((col2 DIV 10000H MOD 100H) * xfleft + (col3 DIV 10000H MOD 100H) * xfright) DIV 65536;

				cb := (b0 * yftop + b1 * yfbottom) DIV 65536;
				cg := (g0 * yftop + g1 * yfbottom) DIV 65536;
				cr := (r0 * yftop + r1 * yfbottom) DIV 65536;

				IF ca # 255 THEN
					(* destination color, expanded to 0..255 per channel *)
					col := SYSTEM.GET16(dstadr);
					dstb := (col MOD 32) * 8;
					dstg := (col DIV 32 MOD 64) * 4;
					dstr := (col DIV 2048 MOD 32) * 8;

					cb := (cb * 256 + (256 - ca) * dstb) DIV 256; IF cb > 255 THEN cb := 255 END;
					cg := (cg * 256 + (256 - ca) * dstg) DIV 256; IF cg > 255 THEN cg := 255 END;
					cr := (cr * 256 + (256 - ca) * dstr) DIV 256; IF cr > 255 THEN cr := 255 END
				END;
				SYSTEM.PUT16(dstadr, ASH(cb, -3) + ASH(ASH(cg, -2), 5) + ASH(ASH(cr, -3), 11))
			END;
			INC(fx, sdx);
			INC(dstadr, 2)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q1BGRA8888BGR565;

(* PROCEDURE SSE2Q1BGRA8888BGR565(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh:LONGINT); VAR x, y, z,xfleft, xfright, yftop, yfbottom : LONGINT; yadr, adr, col, col0, col1, col2, col3 : LONGINT; b0, g0, r0, a0, a01,b1, g1, r1, a1, cb, cg, cr,cb2, cg2, cr2, ca, ca2,dstb, dstg, dstr,res : LONGINT; fx, fy, yadd1, yadd2, xadd1, xadd2: LONGINT; CODE {SYSTEM.i386, SYSTEM.MMX, SYSTEM.SSE, SYSTEM.SSE2} PUSHFD PUSH EBX ; CLI PXOR MMX3,MMX3 PXOR MMX4,MMX4 PXOR MMX5, MMX5 PXOR MMX6, MMX6 PXOR XMM1, XMM1 PXOR XMM3, XMM3 PXOR XMM4, XMM4 PXOR XMM6, XMM6 PXOR XMM7, XMM7 MOV EDX, [EBP+dstadr] MOV EBX, [EBP+dl] SHL EBX, 1 ADD EDX, EBX MOV EBX, [EBP+dt] IMUL EBX, [EBP+dstbpr] ADD EDX, EBX MOV [EBP+yadr], EDX MOV EDX, [EBP+sy] SUB EDX, 8000H ;edx = sy-8000H MOV [EBP+fy], EDX ; sx := sx 
- 8000H; MOV EDX, [EBP+sx] SUB EDX, 8000H ;sx = sx-8000H MOV [EBP+sx] , EDX MOV ECX, [EBP+db] SUB ECX, [EBP+dt] ; counter in y JLE endyloop ;exit MOV [EBP+y], ECX outerloop: MOV EDX, [EBP+yadr] MOV EDI, EDX ; adr in EDI MOV [EBP+adr], EDX MOV EDX, [EBP+sx] ; keep EDX MOV [EBP+fx], EDX MOV EAX, [EBP+fy] MOVD XMM3, EAX ; prepare for top, bottom SAR EAX, 16 CMP EAX, 0 JE zero JL negativ MOV EBX, [EBP+sh] SUB EBX, 1 CMP EAX, EBX JGE bigger ok: MOV EBX, EAX ADD EBX, 1 JMP different zero: MOV EAX, 0 MOV EBX, 1 JMP different negativ: MOV EAX, 0 MOV EBX, 0 JMP samepixel bigger: MOV EAX, EBX JMP samepixel different: MOV ECX, [EBP+srcbpr] MUL EAX, ECX MOV EBX, EAX ADD EBX, ECX MOV ECX, [EBP+srcadr] ADD EAX, ECX ADD EBX, ECX JMP endyadd samepixel: MOV ECX, [EBP+srcbpr] MUL EAX, ECX MOV ECX, [EBP+srcadr] ADD EAX, ECX MOV EBX, EAX endyadd: MOV [EBP+yadd1], EAX MOV [EBP+yadd2], EBX ; yfbottom := (fy MOD 65536); ; yftop := (65536 - fy MOD 65536); MOVD ECX, XMM3 AND ECX, 0FFFFH MOV [EBP+yfbottom],ECX PINSRW XMM3, ECX, 1 NEG ECX ADD ECX, 65535 MOV [EBP+yftop],ECX PINSRW XMM3, ECX, 0 PSRLW XMM3, 1 MOV ECX, [EBP+dr] SUB ECX, [EBP+dl] ; counter in x JLE endyloop ;exit MOV [EBP+x], ECX innerloop: MOV ECX, [EBP+x] ; if x < 8 then do one pixel at the time CMP ECX, 8 JL singlepixel ; else ; take 8 at the time MOV EBX, EDI AND EBX, 0FH CMP EBX, 0 JNE singlepixel alleightpixels: MOV EAX, 0000000FFH MOVD MMX3, EAX ; dest red -> MMX4 MOV EAX, 0F800F800H MOVD MMX4, EAX ; dest green -> MMX5 MOV EAX, 07E007E0H MOVD MMX5, EAX ; dest blue -> MMX6 ; moved as MMX6 is used in singlepixel ; MOV EAX, 001F001FH ; MOVD MMX6, EAX MOV ECX, [EBP+yfbottom] PINSRW XMM3, ECX, 1 MOV ECX, [EBP+yftop] PINSRW XMM3, ECX, 0 PSRLW XMM3,1 PXOR XMM5, XMM5 PXOR XMM2,XMM2 MOV DWORD [EBP+z], 4 loop03: ; shift everything left MOV ECX, [EBP+fx] PSLLDQ XMM5, 4 PINSRW XMM7, ECX,0 ; prepare for l,r SAR ECX, 16 CMP ECX, 0 JE zerox03 JL negativx03 MOV EDX, [EBP+sw] SUB EDX, 1 CMP ECX, EDX JGE biggerx03 okx03: MOV EDX, ECX ADD 
EDX, 1 JMP endbound203 zerox03: MOV ECX, 0 MOV EDX, 1 JMP endbound203 negativx03: MOV ECX, 0 MOV EDX, 0 JMP endbound203 biggerx03: MOV ECX, EDX endbound203: SHL ECX, 2 ; xadd1 SHL EDX, 2 ; xadd2 MOV EAX, [EBP+yadd1] MOV EBX, [EBP+yadd2] MOVD XMM2, [EBX+EDX] PSLLDQ XMM2,4 MOVD XMM1, [EBX+ECX] POR XMM2,XMM1 PSLLDQ XMM2,4 MOVD XMM1, [EAX+EDX] POR XMM2,XMM1 PSLLDQ XMM2,4 MOVD XMM1, [EAX+ECX] POR XMM2,XMM1 PEXTRW EAX,XMM7,0 AND EAX, 0FFFFH PINSRW XMM7, EAX,1 PINSRW XMM7, EAX, 3 ;xfright NEG AX ADD EAX, 65535 PINSRW XMM7, EAX, 0 PINSRW XMM7, EAX, 2 ;xfleft PSRLW XMM7, 1 MOVDQU XMM0, XMM2 PSRLD XMM0, 24 PXOR XMM1, XMM1 MOV ECX, 0FFH ; ECX locked for ca PINSRW XMM1, ECX,0 PINSRW XMM1, ECX,2 PINSRW XMM1, ECX,4 PINSRW XMM1, ECX,6 PCMPEQW XMM1, XMM0 PMOVMSKB EAX, XMM1 CMP EAX, 0FFFFH JE endofalpha03 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW ECX, XMM0, 0 endofalpha03: ; alpha done CMP ECX,0 JE alphazero03 SHL ECX, 24 ; calculate red MOVDQU XMM0, XMM2 PSLLD XMM0, 8 PSRLD XMM0, 24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 SHL EBX,16 OR ECX,EBX ; red done ; calculate green MOVDQU XMM0, XMM2 PSLLD XMM0, 16 PSRLD XMM0, 24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 SHL EBX,8 OR ECX,EBX ; green done ; calculate blue MOVDQU XMM0, XMM2 PSLLD XMM0,24 PSRLD XMM0, 24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0, XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD 
XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 OR ECX,EBX ; blue done ; put color in correct position MOVD XMM4,ECX POR XMM5, XMM4 ; results in XMM5 ; prepared source alphazero03: ; set mask is done later MOV ECX,[EBP+fx] ADD ECX, [EBP+sdx] MOV [EBP+fx],ECX SUB DWORD [EBP+z], 1 JNZ loop03 endofloop03: MOV DWORD [EBP+z], 4 loop47: ; shift everything left PSLLDQ XMM6, 4 PINSRW XMM7, ECX,0 ; prepare for l,r SAR ECX, 16 CMP ECX, 0 JE zerox47 JL negativx47 MOV EDX, [EBP+sw] SUB EDX, 1 CMP ECX, EDX JGE biggerx47 okx47: MOV EDX, ECX ADD EDX, 1 JMP endbound247 zerox47: MOV ECX, 0 MOV EDX, 1 JMP endbound247 negativx47: MOV ECX, 0 MOV EDX, 0 JMP endbound247 biggerx47: MOV ECX, EDX endbound247: SHL ECX, 2 ; xadd1 SHL EDX, 2 ; xadd2 MOV EAX, [EBP+yadd1] MOV EBX, [EBP+yadd2] MOVD XMM2, [EBX+EDX] PSLLDQ XMM2,4 MOVD XMM1, [EBX+ECX] POR XMM2,XMM1 PSLLDQ XMM2,4 MOVD XMM1, [EAX+EDX] POR XMM2,XMM1 PSLLDQ XMM2,4 MOVD XMM1, [EAX+ECX] POR XMM2,XMM1 PEXTRW EAX,XMM7,0 AND EAX, 0FFFFH PINSRW XMM7, EAX,1 PINSRW XMM7, EAX, 3 ;xfright NEG EAX ADD EAX, 65535 PINSRW XMM7, EAX, 0 PINSRW XMM7, EAX, 2 ;xfleft PSRLW XMM7, 1 MOVDQU XMM0, XMM2 PSRLD XMM0, 24 PXOR XMM1, XMM1 MOV ECX, 0FFH ; ECX locked for ca PINSRW XMM1, ECX,0 PINSRW XMM1, ECX,2 PINSRW XMM1, ECX,4 PINSRW XMM1, ECX,6 PCMPEQW XMM1, XMM0 PMOVMSKB EAX, XMM1 CMP EAX, 0FFFFH JE endofalpha47 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW ECX, XMM0, 0 endofalpha47: ; alpha done CMP ECX,0 JE alphazero47 SHL ECX, 24 ; calculate red MOVDQU XMM0, XMM2 PSLLD XMM0, 8 PSRLD XMM0, 24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 SHL EBX,16 OR ECX,EBX ; red done ; calculate green MOVDQU 
XMM0, XMM2 PSLLD XMM0, 16 PSRLD XMM0, 24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 SHL EBX,8 OR ECX,EBX ; green done ; calculate blue MOVDQU XMM0, XMM2 PSLLD XMM0,24 PSRLD XMM0, 24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 OR ECX,EBX ; blue done ; put color in correct position MOVD XMM4,ECX POR XMM6, XMM4 ; results in XMM6 ; prepared source alphazero47: ; set mask is done later MOV ECX,[EBP+fx] ADD ECX, [EBP+sdx] MOV [EBP+fx],ECX SUB DWORD [EBP+z], 1 JNZ loop47 endofloop47: ; all sources calculated, but in reversed order PSHUFD XMM2,XMM5, 1AH PSHUFD XMM1,XMM6, 1AH ; now sources ready for further calculation with destination ; get alphas MOVQ2DQ XMM4, MMX3 MOVDQU XMM6, XMM2 PSHUFD XMM4, XMM4, 0 MOVDQU XMM5, XMM1 PSLLD XMM4, 24 PAND XMM6, XMM4 ; alpha 5-8 in XMM6 PAND XMM5, XMM4 ; alpha 1-4 in XMM5 PSRLD XMM5, 24 PSHUFHW XMM5, XMM5, 85H PSRLD XMM6, 24 ; put both alphas into 1 register PSHUFHW XMM6, XMM6, 85H PSHUFLW XMM5, XMM5, 85H PSHUFLW XMM6, XMM6, 58H PSHUFD XMM5, XMM5, 0D0H ; 0102030400000000 PSHUFD XMM6, XMM6, 5CH ; 0000000005060708 PXOR XMM0,XMM0 POR XMM5, XMM6 ; XMM5 = alphas 0102030405060708 PCMPEQD XMM0, XMM5 PMOVMSKB EAX, XMM0 CMP EAX, 0FFFFH ; all alphas = zero; TEST not possible, because only 8 bits compared JE endloop ; mask out alpha = zero ; fd := 255-ORD(src[a]); fd = XMM4 ; MOV XMM4, 00FF00FF00FF00FF00FF00FF00FF00FFH PXOR XMM4, XMM4 MOV EAX, 00FFH PINSRW XMM4, EAX ,0 PSHUFLW XMM4, XMM4, 0 PSHUFD XMM4, XMM4, 0 PSUBW XMM4, XMM5 MOV EAX,1H PINSRW XMM3, EAX ,0 PSHUFLW XMM3, XMM3, 0 PSHUFD XMM3, XMM3, 0 PADDUSW XMM4, XMM3 ; new red ; calculate red 2 ; get source ; sred14 = src14 && 
(srcMask <<16) ; srcMask << 16 MOVQ2DQ XMM3, MMX3 PSHUFD XMM3, XMM3, 0 MOVDQU XMM5, XMM1 MOVDQU XMM6, XMM2 PSLLD XMM3, 16 ; sred14 = src14 && (srcMask << 24) ; src14 must be copied because it mustn't be changed PAND XMM5, XMM3 ; sred14 PSRLD XMM5, 16 ; sred14s = shuffled sred14 PSHUFHW XMM5, XMM5,85H PAND XMM6, XMM3 ; sred58 PSRLD XMM6, 16 PSHUFLW XMM5, XMM5,85H PSHUFHW XMM6, XMM6,85H PSHUFD XMM5, XMM5,0D0H ; sred14s PSHUFLW XMM6, XMM6,58H PSHUFD XMM6, XMM6,5CH ; sred58s POR XMM5, XMM6 ; sred18 ; sred18255 = sred18 * 256- sred18 MOVDQU XMM7, XMM5 PSLLW XMM5, 8 PSUBUSW XMM5, XMM7 ; sred18255 ; src is now ready ; destination ; dest18 must be copied because it mustn't be changed ; Load data into memory MOV EDI, [EBP+adr] MOVDQU XMM3, [EDI] ;dest 1-8 MOVQ2DQ XMM6, MMX4 PSHUFD XMM6, XMM6, 0 MOVDQU XMM7, XMM3 PAND XMM7, XMM6 ; dred18 PSRLW XMM7, 8 ; dred18alpha = dred18 * negalpha PMULLW XMM7, XMM4 ; dred18alpha ; dest is prepared ; combining dest and src ; dred18big = sred18255 + dred18alpha PADDUSW XMM7, XMM5 ; dred18big ; dred18f = dred18big && destMaskred128 because >> 11 and << 11 is && mask PAND XMM7, XMM6 ; dred18f ; dest18nr0 = dest18 && (~destMaskred128) PANDN XMM6, XMM3 ; dest18nr0 ; dest18nrf = dest18nr0 || dred18f POR XMM6, XMM7 MOVDQU XMM3, XMM6 ; red is calculated ; calculate green: ; get source ; sgreen14 = src14 && (srcMask <<8) ; srcMask << 8 MOVQ2DQ XMM7, MMX3 PSHUFD XMM7, XMM7, 0 MOVDQU XMM5, XMM1 PSLLD XMM7, 8 PAND XMM5, XMM7 ; sgreen14 PSRLD XMM5, 8 ; sgreen14s = shuffled sgreen14 PSHUFHW XMM5, XMM5,85H MOVDQU XMM6, XMM2 PSHUFLW XMM5, XMM5,85H PAND XMM6, XMM7 ; sgreen58 PSRLD XMM6, 8 PSHUFD XMM5, XMM5,0D0H ; sgreen14s ; sgreen58 = src58&& (srcMask << 8) ; src58 must be copied because it mustn't be changed ; sgreen58s = shuffled sgreen58 PSHUFHW XMM6, XMM6,85H PSHUFLW XMM6, XMM6,58H PSHUFD XMM6, XMM6,5CH ; sgreen58s ; sgreen18 = sgreen14s || sgreen58s POR XMM5, XMM6 ; sgreen18 ; sgreen18255 = sgreen18 * 256- sgreen18 MOVDQU XMM7, XMM5 MOVQ2DQ XMM6, 
MMX5 PSLLW XMM5, 8 PSUBUSW XMM5, XMM7 ; sgreen18255 PSHUFD XMM6, XMM6, 0 MOVDQU XMM7, XMM3 PAND XMM7, XMM6 ; dgreen18 PSRLW XMM7,3 ; dgreen18alpha = dgreen18 * negalpha PMULLW XMM7, XMM4 ; dgreen18alpha ; dest is prepared ; combining dest and src ; dgreen18big = sgreen18255 + dgreen18alpha PADDUSW XMM7, XMM5 ; dgreen18big PANDN XMM6, XMM3 ; dest18ng0 ; dgreen18f = (dgreen18big >> 11) <<5 PSRLW XMM7, 10 ; dgreen18f PSLLW XMM7, 5 ; dest18ng0 = dest18 && (~destMaskgreen128) ; dest18ngf = dest18ng0 || dred18f POR XMM6, XMM7 MOVDQU XMM3, XMM6 ; green is calculated ; calculate blue MOV EAX, 001F001FH MOVD MMX6, EAX ; get source ; sblue14 = src14 && (srcMask) ; srcMask MOVQ2DQ XMM7, MMX3 MOVDQU XMM5, XMM1 PSHUFD XMM7, XMM7, 0 MOVDQU XMM6, XMM2 ; sblue14 = src14 && (srcMask) ; src14 must be copied because it mustn't be changed PAND XMM5, XMM7 ; sblue14 ; sblue14s = shuffled sblue14 PSHUFHW XMM5, XMM5,85H PAND XMM6, XMM7 ; sblue58 PSHUFHW XMM6, XMM6,85H PSHUFLW XMM5, XMM5,85H PSHUFLW XMM6, XMM6,58H PSHUFD XMM5, XMM5,0D0H ; sblue14s PSHUFD XMM6, XMM6,5CH ; sblue58s POR XMM5, XMM6 ; sblue18 ; sblue18255 = sblue18 * 256- sblue18 MOVDQU XMM7, XMM5 PSLLW XMM5, 8 PSUBUSW XMM5, XMM7 ; sblue18255 MOVQ2DQ XMM6, MMX6 PSHUFD XMM6, XMM6, 0 MOVDQU XMM7, XMM3 PAND XMM7, XMM6 ; dblue18 PSLLW XMM7, 3 PMULLW XMM7, XMM4 ; dblue18alpha ; dest is prepared ; combining dest and src ; dblue18big = sblue18255 + dblue18alpha PADDUSW XMM7, XMM5 ; dblue18big ; dblue18f = (dblue18big >> 11) PANDN XMM6, XMM3 ; dest18nr0 PSRLW XMM7, 11 ; dblue18f ; dest18nr0 = dest18 && (~destMaskblue128) ; dest18nbf = dest18nb0 || dblue18f POR XMM6, XMM7 MOVDQU XMM3, XMM6 ; blue is calculated ; now dest is calculated, store it ; get 0 stuff MOVDQU XMM5, [EDI] PAND XMM5,XMM0 PANDN XMM0, XMM3 POR XMM0, XMM5 MOVDQU [EDI],XMM0 endloop: ;fx already inc ; by sdx ADD EDI, 16 MOV [EBP+adr],EDI SUB DWORD [EBP+x], 8 JNZ innerloop ; x>=0 JZ endxloop singlepixel: ; original code from MMXBGRA8888Over565, adjusted to fit this 
procedure MOV EDI, [EBP+adr] MOV EAX, 0000000FFH MOVD MMX3, EAX ; dest red -> MMX4 MOV EAX, 0F800F800H MOVD MMX4, EAX ; dest green -> MMX5 MOV EAX, 07E007E0H MOVD MMX5, EAX ; dest blue -> MMX6 ; moved as MMX6 is used in singlepixel ; MOV EAX, 001F001FH ; MOVD MMX6, EAX MOV ECX, [EBP+yfbottom] PINSRW XMM3, ECX, 1 MOV ECX, [EBP+yftop] PINSRW XMM3, ECX, 0 PSRLW XMM3,1 MOV ECX, [EBP+fx] PINSRW XMM7, ECX,0 ; prepare for l,r SAR ECX, 16 CMP ECX, 0 JE zerox JL negativx MOV EDX, [EBP+sw] SUB EDX, 1 CMP ECX, EDX JGE biggerx okx: MOV EDX, ECX ADD EDX, 1 JMP endbound2 zerox: MOV ECX, 0 MOV EDX, 1 JMP endbound2 negativx: MOV ECX, 0 MOV EDX, 0 JMP endbound2 biggerx: MOV ECX, EDX endbound2: SHL ECX, 2 ; xadd1 SHL EDX, 2 ; xadd2 MOV EAX, [EBP+yadd1] MOV EBX, [EBP+yadd2] MOVD XMM2, [EBX+EDX] PSLLDQ XMM2,4 MOVD XMM1, [EBX+ECX] POR XMM2,XMM1 PSLLDQ XMM2,4 MOVD XMM1, [EAX+EDX] POR XMM2,XMM1 PSLLDQ XMM2,4 MOVD XMM1, [EAX+ECX] POR XMM2,XMM1 PEXTRW EAX,XMM7,0 AND EAX, 0FFFFH PINSRW XMM7, EAX,1 PINSRW XMM7, EAX, 3 ;xfright NEG EAX ADD EAX, 65535 PINSRW XMM7, EAX, 0 PINSRW XMM7, EAX, 2 ;xfleft PSRLW XMM7, 1 MOVDQU XMM0, XMM2 PSRLD XMM0, 24 PXOR XMM1, XMM1 MOV ECX, 0FFH ; ECX locked for ca PINSRW XMM1, ECX,0 PINSRW XMM1, ECX,2 PINSRW XMM1, ECX,4 PINSRW XMM1, ECX,6 PCMPEQW XMM1, XMM0 PMOVMSKB EAX, XMM1 CMP EAX, 0FFFFH JE endofalpha PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW ECX, XMM0, 0 endofalpha: ; alpha done CMP ECX,0 JE alphazero ; calculate red MOVDQU XMM0, XMM2 PSLLD XMM0, 8 PSRLD XMM0, 24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 PINSRW XMM4, EBX, 4 ; red done ; calculate green MOVDQU XMM0, XMM2 PSLLD XMM0, 16 PSRLD XMM0, 
24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 PINSRW XMM4, EBX, 2 ; green done ; calculate blue MOVDQU XMM0, XMM2 PSLLD XMM0,24 PSRLD XMM0, 24 PSHUFLW XMM0, XMM0,58H PSHUFHW XMM0, XMM0,58H PSHUFD XMM0,XMM0,58H PMADDWD XMM0,XMM7 PSRLD XMM0, 15 ; XMM7 already shifted by 1 PSHUFLW XMM0, XMM0, 58H PMADDWD XMM0, XMM3 PSRLD XMM0,15 ; XMM3 already shifted by 1 PEXTRW EBX, XMM0,0 PINSRW XMM4, EBX, 0 ; blue done ; prepared source CMP ECX, 0FFH ; ECX released JE alpha255 NEG ECX ADD ECX, 0FFH PINSRW XMM1, ECX, 1 ; 255-ca PINSRW XMM1, ECX, 3 ; 255-ca PINSRW XMM1, ECX, 5 ; 255-ca MOV EAX, 0FFH PINSRW XMM1, EAX, 0 ; 255 PINSRW XMM1, EAX, 2 ; 255 PINSRW XMM1, EAX, 4 ; 255 ;prepare destination MOV EBX, [EBP+adr] MOV EBX, [EBX] MOV EAX, EBX AND EAX, 01FH SHL EAX,3 PINSRW XMM4, EAX, 1 ; dstb MOV EAX, EBX AND EAX, 07E0H SHR EAX, 3 PINSRW XMM4, EAX, 3 ; dstg AND EBX, 0F800H SHR EBX,8 PINSRW XMM4, EBX, 5 ; dstr PMADDWD XMM4, XMM1 PSRLD XMM4, 8 PXOR XMM1,XMM1 PACKUSWB XMM4,XMM1 ; put results into their words PEXTRW EAX, XMM4, 2 ; end red PINSRW XMM4, EAX, 4 PEXTRW EAX, XMM4, 1 ; end green PINSRW XMM4, EAX, 2 alpha255: ; red in XMM4,4; green in XMM4, 2; blue in XMM4,0 ;SYSTEM.PUT16(adr, ASH(cb, -3) + ASH(ASH(cg, -2), 5) + ASH(ASH(cr, -3), 11)) PEXTRW EAX, XMM4, 0 ; end blue SHR EAX,3 AND EAX, 001FH PEXTRW EBX, XMM4, 2 ; end green SHL EBX,3 AND EBX, 07E0H OR EAX, EBX PEXTRW EBX, XMM4, 4 ; end red SHL EBX,8 AND EBX, 0F800H OR EAX, EBX MOV EDI,[EBP+adr] MOV [EDI], AX alphazero: ; alpha = 0, no writeback MOV ECX,[EBP+fx] ADD ECX, [EBP+sdx] MOV [EBP+fx],ECX MOV EDI,[EBP+adr] ADD EDI, 2 ; inc adr MOV [EBP+adr],EDI SUB DWORD [EBP+x], 1 JNZ innerloop endxloop: MOV EAX,[EBP+fy] ; fy := fy + sdy ADD EAX, [EBP+sdy] MOV [EBP+fy], EAX MOV EAX,[EBP+yadr] ADD EAX, [EBP+dstbpr] ;MOV EDI, EAX MOV [EBP+yadr], EAX SUB 
DWORD [EBP+y], 1 JNZ outerloop endyloop: EMMS ; declare FPU registers free POP EBX POPFD END SSE2Q1BGRA8888BGR565; *)

(* Point-sampled scaling of a 32 bit source (blue in bits 0..7, green 8..15, red 16..23, alpha 24..31) onto
	a 16 bit 5-6-5 destination with alpha blending, working directly on memory.
	srcadr, dstadr : address of the first pixel row of source / destination
	srcbpr, dstbpr : bytes per row of source / destination
	dl, dt, dr, db : destination rectangle; pixels with dl <= x < dr and dt <= y < db are processed
	sx, sy : 16.16 fixed point source position of the first destination pixel
	sdx, sdy : 16.16 fixed point source step per destination pixel / destination row
	sw, sh : not used by this procedure
	Per pixel: source alpha 0 -> destination untouched; 255 -> source colour is written; otherwise
	colour + (256 - ca) * destination DIV 256 is written.
	The blended channels are limited to 255: a value of 256 does not fit the 5/6 bit fields and would
	carry into the neighbouring channel (or, for red, be cut off by the 16 bit store). *)
PROCEDURE Q0BGRA8888BGR565(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y : LONGINT; yadr, adr: ADDRESS; col, col0 : LONGINT;
	cb, cg, cr, ca, dstb, dstg, dstr: LONGINT; yadd: ADDRESS;
	fx, fy : LONGINT;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 2 + dt * dstbpr;
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		yadd := srcadr + (fy DIV 65536) * srcbpr;
		FOR x := dl TO dr - 1 DO
			col0 := SYSTEM.GET32(yadd + (fx DIV 65536) * 4);
			ca := (col0 DIV 1000000H MOD 100H);

			IF ca # 0 THEN
				cb := (col0 MOD 100H);
				cg := (col0 DIV 100H MOD 100H);
				cr := (col0 DIV 10000H MOD 100H);

				IF ca # 255 THEN
					(* destination color, expanded to 0..255 per channel *)
					col := SYSTEM.GET16(adr);
					dstb := (col MOD 32) * 8;
					dstg := (col DIV 32 MOD 64) * 4;
					dstr := (col DIV 2048 MOD 32) * 8;

					cb := (cb * 256 + (256 - ca) * dstb) DIV 256; IF cb > 255 THEN cb := 255 END;
					cg := (cg * 256 + (256 - ca) * dstg) DIV 256; IF cg > 255 THEN cg := 255 END;
					cr := (cr * 256 + (256 - ca) * dstr) DIV 256; IF cr > 255 THEN cr := 255 END
				END;
				SYSTEM.PUT16(adr, ASH(cb, -3) + ASH(ASH(cg, -2), 5) + ASH(ASH(cr, -3), 11))
			END;
			INC(fx, sdx);
			INC(adr, 2)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q0BGRA8888BGR565;

(* Point-sampled scaling of a 32 bit source onto a 32 bit destination (both: blue in bits 0..7,
	green 8..15, red 16..23, alpha 24..31) with alpha blending, working directly on memory.
	Parameters as for Q0BGRA8888BGR565; both buffers have 4 bytes per pixel.
	Per pixel: source alpha 0 -> destination untouched; 255 -> source pixel is written; otherwise every
	channel including alpha becomes source + (256 - ca) * destination DIV 256.
	The blended channels are limited to 255: a value of 256 would carry into the next byte of the packed
	pixel (and an alpha of 256 would be shifted out of the 32 bit word). *)
PROCEDURE Q0BGRA8888BGRA8888(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y : LONGINT; yadr, adr: ADDRESS; col, col0 : LONGINT;
	cb, cg, cr, ca, dstb, dstg, dstr, dsta : LONGINT; yadd: ADDRESS;
	fx, fy : LONGINT;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 4 + dt * dstbpr;
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		yadd := srcadr + (fy DIV 65536) * srcbpr;
		FOR x := dl TO dr - 1 DO
			col0 := SYSTEM.GET32(yadd + (fx DIV 65536) * 4);
			ca := (col0 DIV 1000000H MOD 100H);

			IF ca # 0 THEN
				cb := (col0 MOD 100H);
				cg := (col0 DIV 100H MOD 100H);
				cr := (col0 DIV 10000H MOD 100H);

				IF ca # 255 THEN
					(* destination color *)
					col := SYSTEM.GET32(adr);
					dstb := (col MOD 100H);
					dstg := (col DIV 100H) MOD 100H;
					dstr := (col DIV 10000H) MOD 100H;
					dsta := (col DIV 1000000H) MOD 100H;

					cb := (cb * 256 + (256 - ca) * dstb) DIV 256; IF cb > 255 THEN cb := 255 END;
					cg := (cg * 256 + (256 - ca) * dstg) DIV 256; IF cg > 255 THEN cg := 255 END;
					cr := (cr * 256 + (256 - ca) * dstr) DIV 256; IF cr > 255 THEN cr := 255 END;
					(* alpha last: the colour channels above need the unmodified source alpha *)
					ca := (ca * 256 + (256 - ca) * dsta) DIV 256; IF ca > 255 THEN ca := 255 END
				END;
				SYSTEM.PUT32(adr, cb + LSH(cg, 8) + LSH(cr, 16) + LSH(ca, 24))
			END;
			INC(fx, sdx);
			INC(adr, 4)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q0BGRA8888BGRA8888;

(* Point-sampled copy between two buffers of 32 bit pixels without blending, working directly on memory.
	Parameters as for Q0BGRA8888BGRA8888; sw and sh are not used. Every destination pixel in the
	rectangle is overwritten with the sampled source pixel. *)
PROCEDURE Q0BGRA8888BGRA8888Copy(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y : LONGINT; yadr, adr: ADDRESS; col0 : LONGINT;
	yadd : ADDRESS;
	fx, fy : LONGINT;
BEGIN
	fy := sy;
	yadr := dstadr + dl * 4 + dt * dstbpr;
	FOR y := dt TO db - 1 DO
		fx := sx;
		adr := yadr;
		yadd := srcadr + (fy DIV 65536) * srcbpr;
		FOR x := dl TO dr - 1 DO
			col0 := SYSTEM.GET32(yadd + (fx DIV 65536) * 4);
			SYSTEM.PUT32(adr, col0);
			INC(fx, sdx);
			INC(adr, 4)
		END;
		INC(fy, sdy);
		INC(yadr, dstbpr)
	END
END Q0BGRA8888BGRA8888Copy;

PROCEDURE Q1BGRA8888BGRA8888(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT);
VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr: ADDRESS;
	col, col0, col1, col2, col3 : LONGINT;
	b0, g0, r0, a0, b1, g1, r1, a1, cb, cg, cr, ca, dstb, dstg, dstr, dsta : LONGINT;
	fx, fy, xadd0, xadd1: LONGINT; yadd0, yadd1: ADDRESS;
BEGIN
	yadr := dstadr + dl * 4 + dt * dstbpr;
	fy := sy - 8000H; sx := sx - 8000H;
	FOR y := dt TO db - 1 DO
		fx := sx;
		dstadr := yadr;
		yadd0 := srcadr + Bounds(fy DIV 65536, 0, sh - 1) * srcbpr;
		yadd1 := srcadr + Bounds(fy DIV 65536 + 1, 0, sh - 1) * srcbpr;
		FOR x := dl TO dr - 1 DO
			(* destination color *)
			col := SYSTEM.GET32(dstadr);
			dstb := col MOD 100H;
			dstg := col DIV 100H MOD 100H;
			dstr := col DIV 10000H MOD 100H;
			dsta := col DIV 1000000H MOD 100H;
			xadd0 := 
Bounds(fx DIV 65536, 0, sw - 1) * 4; xadd1 := Bounds(fx DIV 65536 + 1, 0, sw - 1) * 4; col0 := SYSTEM.GET32(yadd0 + xadd0); col1 := SYSTEM.GET32(yadd0 + xadd1); col2 := SYSTEM.GET32(yadd1 + xadd0); col3 := SYSTEM.GET32(yadd1 + xadd1); xfleft := (65536 - fx MOD 65536); xfright := (fx MOD 65536); yftop := (65536 - fy MOD 65536); yfbottom := (fy MOD 65536); a0 := ((col0 DIV 1000000H MOD 100H) * xfleft + (col1 DIV 1000000H MOD 100H) * xfright) DIV 65536; a1 := ((col2 DIV 1000000H MOD 100H) * xfleft + (col3 DIV 1000000H MOD 100H) * xfright) DIV 65536; ca := (a0 * yftop + a1 * yfbottom) DIV 65536; IF ca # 0 THEN b0 := ((col0 MOD 100H) * xfleft + (col1 MOD 100H) * xfright) DIV 65536; g0 := ((col0 DIV 100H MOD 100H) * xfleft + (col1 DIV 100H MOD 100H) * xfright) DIV 65536; r0 := ((col0 DIV 10000H MOD 100H) * xfleft + (col1 DIV 10000H MOD 100H) * xfright) DIV 65536; a0 := ((col0 DIV 1000000H MOD 100H) * xfleft + (col1 DIV 1000000H MOD 100H) * xfright) DIV 65536; b1 := ((col2 MOD 100H) * xfleft + (col3 MOD 100H) * xfright) DIV 65536; g1 := ((col2 DIV 100H MOD 100H) * xfleft + (col3 DIV 100H MOD 100H) * xfright) DIV 65536; r1 := ((col2 DIV 10000H MOD 100H) * xfleft + (col3 DIV 10000H MOD 100H) * xfright) DIV 65536; a1 := ((col2 DIV 1000000H MOD 100H) * xfleft + (col3 DIV 1000000H MOD 100H) * xfright) DIV 65536; cb := (b0 * yftop + b1 * yfbottom) DIV 65536; cg := (g0 * yftop + g1 * yfbottom) DIV 65536; cr := (r0 * yftop + r1 * yfbottom) DIV 65536; ca := (a0 * yftop + a1 * yfbottom) DIV 65536; IF ca # 255 THEN cb := (cb * 256 + (256 - ca) * dstb) DIV 256; IF cb > 256 THEN cb := 256 END; cg := (cg * 256 + (256 - ca) * dstg) DIV 256; IF cg > 256 THEN cg := 256 END; cr := (cr * 256 + (256 - ca) * dstr) DIV 256; IF cr > 256 THEN cr := 256 END; ca := (ca * 256 + (256 - ca) * dsta) DIV 256; IF ca > 256 THEN ca := 256; END; END; SYSTEM.PUT32(dstadr, cb + LSH(cg, 8) + LSH(cr, 16) + LSH(ca, 24)); END; INC(fx, sdx); INC(dstadr, 4); END; INC(fy, sdy); INC(yadr, dstbpr) END END 
Q1BGRA8888BGRA8888; PROCEDURE Q1BGRA8888BGRA8888Copy(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT); VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr: ADDRESS; col, col0, col1, col2, col3 : LONGINT; b0, g0, r0, a0, b1, g1, r1, a1, cb, cg, cr, ca : LONGINT; fx, fy, xadd0, xadd1: LONGINT; yadd0, yadd1: ADDRESS; BEGIN yadr := dstadr + dl * 4 + dt * dstbpr; fy := sy - 8000H; sx := sx - 8000H; FOR y := dt TO db - 1 DO fx := sx; dstadr := yadr; yadd0 := srcadr + Bounds(fy DIV 65536, 0, sh - 1) * srcbpr; yadd1 := srcadr + Bounds(fy DIV 65536 + 1, 0, sh - 1) * srcbpr; FOR x := dl TO dr - 1 DO (* destination color *) xadd0 := Bounds(fx DIV 65536, 0, sw - 1) * 4; xadd1 := Bounds(fx DIV 65536 + 1, 0, sw - 1) * 4; col0 := SYSTEM.GET32(yadd0 + xadd0); col1 := SYSTEM.GET32(yadd0 + xadd1); col2 := SYSTEM.GET32(yadd1 + xadd0); col3 := SYSTEM.GET32(yadd1 + xadd1); xfleft := (65536 - fx MOD 65536); xfright := (fx MOD 65536); yftop := (65536 - fy MOD 65536); yfbottom := (fy MOD 65536); a0 := ((col0 DIV 1000000H MOD 100H) * xfleft + (col1 DIV 1000000H MOD 100H) * xfright) DIV 65536; a1 := ((col2 DIV 1000000H MOD 100H) * xfleft + (col3 DIV 1000000H MOD 100H) * xfright) DIV 65536; ca := (a0 * yftop + a1 * yfbottom) DIV 65536; IF ca # 0 THEN b0 := ((col0 MOD 100H) * xfleft + (col1 MOD 100H) * xfright) DIV 65536; g0 := ((col0 DIV 100H MOD 100H) * xfleft + (col1 DIV 100H MOD 100H) * xfright) DIV 65536; r0 := ((col0 DIV 10000H MOD 100H) * xfleft + (col1 DIV 10000H MOD 100H) * xfright) DIV 65536; b1 := ((col2 MOD 100H) * xfleft + (col3 MOD 100H) * xfright) DIV 65536; g1 := ((col2 DIV 100H MOD 100H) * xfleft + (col3 DIV 100H MOD 100H) * xfright) DIV 65536; r1 := ((col2 DIV 10000H MOD 100H) * xfleft + (col3 DIV 10000H MOD 100H) * xfright) DIV 65536; cb := (b0 * yftop + b1 * yfbottom) DIV 65536; cg := (g0 * yftop + g1 * yfbottom) DIV 65536; cr := (r0 * yftop + r1 * yfbottom) DIV 65536; SYSTEM.PUT32(dstadr, cb + LSH(cg, 8) + LSH(cr, 16) + 
LSH(ca, 24)); END; INC(fx, sdx); INC(dstadr, 4); END; INC(fy, sdy); INC(yadr, dstbpr) END END Q1BGRA8888BGRA8888Copy; PROCEDURE SSE2Q0BGRA8888BGR565(srcadr, dstadr: ADDRESS; srcbpr, dstbpr, dl, dt, dr, db, sx, sy, sdx, sdy, sw, sh : LONGINT); VAR x, y, xfleft, xfright, yftop, yfbottom : LONGINT; yadr, adr, col, col0, col1, col2, col3 : LONGINT; cb, cg, cr, ca, dstb, dstg, dstr, yadd : LONGINT; fx, fy : LONGINT; w : LONGINT; END SSE2Q0BGRA8888BGR565; PROCEDURE Scale*(src : Image; sr : Rectangle; dst : Image; dr : Rectangle; clip : Rectangle; copyMode, scaleMode : LONGINT); VAR dw, dh, sw, sh : LONGINT; fw, fh : LONGREAL; sx, sy : LONGINT; scaler : ScalerProc; xscaler : XScalerProc; mode : Raster.Mode; SSE2enabled : BOOLEAN; BEGIN ASSERT((clip.l >= 0) & (clip.t >= 0) & (clip.r <= dst.width) & (clip.b <= dst.height)); ASSERT((sr.l >= 0) & (sr.t >= 0) & (sr.r <= src.width) & (sr.b <= src.height)); dw := dr.r - dr.l; dh := dr.b - dr.t; sw := sr.r - sr.l; sh := sr.b - sr.t; IF (sw = dw) & (sh = dh) THEN (* optimize special case *) IF ~Rect.IsContained(clip, dr) THEN IF dr.l < clip.l THEN DEC(dw, (clip.l - dr.l)); INC(sr.l, (clip.l - dr.l)); dr.l := clip.l END; IF dr.t < clip.t THEN DEC(dh, (clip.t - dr.t)); INC(sr.t, (clip.t - dr.t)); dr.t := clip.t END; IF dr.r > clip.r THEN DEC(dw, (dr.r - clip.r)) END; IF dr.b > clip.b THEN DEC(dh, (dr.b - clip.b)) END; END; IF (dw > 0) & (dh > 0) THEN IF copyMode = ModeCopy THEN Raster.InitMode(mode, Raster.srcCopy) ELSE Raster.InitMode(mode, Raster.srcOverDst) END; Raster.Copy(src, dst, sr.l, sr.t, sr.l + dw, sr.t + dh, dr.l, dr.t, mode) END; RETURN END; fw := sw / dw; fh := sh / dh; sx := sr.l * 65536; sy := sr.t * 65536; (* clipping *) IF ~Rect.IsContained(clip, dr) THEN sw := sr.r - sr.l; sh := sr.b - sr.t; dw := dr.r - dr.l; dh := dr.b - dr.t; IF dr.r > clip.r THEN dr.r := clip.r END; IF dr.b > clip.b THEN dr.b := clip.b END; IF dr.l < clip.l THEN sx := ENTIER(65536 * (sr.l + sw * (clip.l - dr.l) / dw)); dr.l := clip.l END; IF 
dr.t < clip.t THEN sy := ENTIER(65536 * (sr.t + sh * (clip.t - dr.t) / dh)); dr.t := clip.t END; END; IF Rect.RectEmpty(dr) THEN RETURN END; xscaler := NIL; SSE2enabled :=Raster.SSE2enabled; (*Machine.SSE2Support; *) (*IF SSE2enabled THEN IF (src.fmt.code = Raster.bgr565) & (dst.fmt.code = Raster.bgr565) THEN IF copyMode = ModeCopy THEN IF scaleMode = 0 THEN xscaler := SSE2Q0BGR565BGR565; ELSIF scaleMode = 1 THEN xscaler:= SSE2Q1BGR565BGR565; END; END; ELSIF (src.fmt.code = Raster.bgra8888) & (dst.fmt.code = Raster.bgr565) THEN IF copyMode = ModeSrcOverDst THEN IF scaleMode = 0 THEN xscaler := SSE2Q0BGRA8888BGR565; ELSIF scaleMode = 1 THEN xscaler := SSE2Q1BGRA8888BGR565; END; END; END; END;*) IF (xscaler = NIL) THEN IF (src.fmt.code = Raster.bgr565) & (dst.fmt.code = Raster.bgr565) THEN IF copyMode = ModeCopy THEN IF scaleMode = 0 THEN xscaler := XQ0BGR565BGR565; ELSIF scaleMode = 1 THEN xscaler := Q1BGR565BGR565; END; END; ELSIF (src.fmt.code = Raster.bgra8888) & (dst.fmt.code = Raster.bgr565) THEN IF copyMode = ModeSrcOverDst THEN IF scaleMode = 0 THEN xscaler := Q0BGRA8888BGR565; ELSIF scaleMode = 1 THEN xscaler := Q1BGRA8888BGR565; END; END; ELSIF (src.fmt.code = Raster.bgra8888) & (dst.fmt.code = Raster.bgra8888) THEN IF (copyMode = ModeSrcOverDst) THEN IF (scaleMode = 0) THEN xscaler := Q0BGRA8888BGRA8888; ELSIF (scaleMode = 1) THEN xscaler := Q1BGRA8888BGRA8888; END; ELSIF (copyMode = ModeCopy) THEN IF (scaleMode = 0) THEN xscaler := Q0BGRA8888BGRA8888Copy; ELSIF (scaleMode = 1) THEN xscaler := Q1BGRA8888BGRA8888Copy; END; END; END; END; IF xscaler # NIL THEN xscaler(src.adr, dst.adr, src.bpr, dst.bpr, dr.l, dr.t, dr.r, dr.b, sx, sy, ENTIER(fw * 65536), ENTIER(fh * 65536), src.width, src.height) ELSE scaler := Q0GenericSrcOverDst; (* fallback case *) IF copyMode = ModeCopy THEN IF scaleMode = 0 THEN scaler := Q0GenericCopy ELSIF scaleMode = 1 THEN scaler := Q1GenericCopy END ELSIF copyMode = ModeSrcOverDst THEN IF scaleMode = 0 THEN scaler := 
Q0GenericSrcOverDst ELSIF scaleMode = 1 THEN scaler := Q1GenericSrcOverDst END; END; scaler(src, dst, dr, sx, sy, ENTIER(fw * 65536), ENTIER(fh * 65536)); END; END Scale; PROCEDURE Bounds(val, min, max : LONGINT) : LONGINT; BEGIN IF val < min THEN RETURN min ELSIF val > max THEN RETURN max ELSE RETURN val END END Bounds; END WMRasterScale. SpeedTest.Mod