
added support for array subtraction optimizations

git-svn-id: https://svn.inf.ethz.ch/svn/lecturers/a2/trunk@8668 8c9fc860-2736-0410-a75d-ab315db34111
eth.morozova 6 years ago
parent
commit
3faf78e6aa
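
In practice the new loops are reached through the generic array "-" operators in FoxArrayBase.Mod (second file below), which now dispatch via ArrayBase.loopSubARAR / loopSubAXAX instead of the unoptimized SubARARLoop / SubAXAXLoop. A minimal usage sketch, not part of this commit (module name and test values are illustrative):

MODULE TestArraySub;	(* illustrative only, not part of the commit *)
IMPORT KernelLog;

	PROCEDURE Do*;
	VAR a, b, c: ARRAY [*] OF REAL;
	BEGIN
		NEW( a, 3 );  NEW( b, 3 );
		a[0] := 5.0;  a[1] := 7.0;  a[2] := 9.0;
		b[0] := 1.0;  b[1] := 2.0;  b[2] := 3.0;
		c := a - b;	(* "-" calls ApplyBinaryAAAOp with loopSubARAR, i.e. SubARARLoopSSE once the optimized module is installed *)
		ASSERT( c[2] = 6.0 );
		KernelLog.String( "array subtraction ok" );  KernelLog.Ln
	END Do;

END TestArraySub.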

+ 342 - 0
source/AMD64.FoxArrayBaseOptimized.Mod

@@ -1460,6 +1460,344 @@ VAR
 		JMP	singlepieces	;
 		endL:
 	END AddARARLoopSSE;
+	
+(* array subtraction *)
+	PROCEDURE SubAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.AMD64, SYSTEM.FPU}
+		MOV	RAX, [RBP+len]	;
+		MOV	RBX, [RBP+ladr]	;
+		MOV	RCX, [RBP+radr]	;
+		MOV	RDX, [RBP+dadr]	;
+		start:
+		CMP	RAX, 0	;
+		JLE	endL	;
+		FLD	QWORD [RBX]	;
+		ADD	RBX, [RBP+linc]	;
+		FLD	QWORD [RCX]	;
+		ADD	RCX, [RBP+rinc]	;
+		FSUBP	;
+		FSTP	QWORD [RDX]	;
+		ADD	RDX, [RBP+dinc]	;
+		DEC	RAX	;
+		JMP	start	;
+		endL:
+		FWAIT	;
+	END SubAXAXLoopA;
+
+	PROCEDURE SubARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.AMD64, SYSTEM.FPU}
+		MOV	RAX, [RBP+len]	;
+		MOV	RBX, [RBP+ladr]	;
+		MOV	RCX, [RBP+radr]	;
+		MOV	RDX, [RBP+dadr]	;
+		start:
+		CMP	RAX, 0	;
+		JLE	endL	;
+		FLD	DWORD [RBX]	;
+		ADD	RBX, [RBP+linc]	;
+		FLD	DWORD [RCX]	;
+		ADD	RCX, [RBP+rinc]	;
+		FSUBP	;
+		FSTP	DWORD [RDX]	;
+		ADD	RDX, [RBP+dinc]	;
+		DEC	RAX	;
+		JMP	start	;
+		endL:
+		FWAIT	;
+	END SubARARLoopA;
+
+	PROCEDURE SubAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.AMD64, SYSTEM.SSE2}
+		MOV	RAX, [RBP+len]	;
+		CMP	RAX, 0	;
+		JLE	endL	;  nothing TO be done, RAX > 0 guaranteed from here on
+		MOV	RBX, [RBP+ladr]	;
+		MOV	RCX, [RBP+radr]	;
+		MOV	RDX, [RBP+dadr]	;
+		;  check IF data are contiguous IN memory
+		CMP	[RBP+linc], 8	;  check left FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		CMP	[RBP+rinc], 8	;  check right FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		CMP	[RBP+dinc], 8	;  check destination FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		;  check FOR alignment
+		MOV	RSI, RBX	;
+		AND	RSI, 7	;  ladr MOD 8
+		CMP	RSI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	RSI, RCX	;
+		AND	RSI, 7	;  radr MOD 8
+		CMP	RSI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	RSI, RDX	;
+		AND	RSI, 7	;  dadr MOD 8
+		CMP	RSI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	RSI, RBX	;
+		AND	RSI, 8	;  16 byte alignment
+		MOV	RDI, RCX	;
+		AND	RDI, 8	;  16 byte alignment
+		CMP	RSI, RDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and radr
+		MOV	RDI, RDX	;
+		AND	RDI, 8	;  16 byte alignment
+		CMP	RSI, RDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF dadr and radr
+		CMP	RSI, 8	;
+		JNE	aligned	;  ladr, radr and dadr already 128 bit aligned
+		;  one single element processing TO achieve 128 bit alignment
+		MOVSD	XMM0, [RBX]	;
+		MOVSD	XMM1, [RCX]	;
+		SUBSD	XMM0, XMM1	;  left - right
+		MOVSD	[RDX], XMM0	;
+		ADD	RBX, 8	;  now RBX IS 16 byte aligned
+		ADD	RCX, 8	;  now RCX IS 16 byte aligned	;
+		ADD	RDX, 8	;  now RDX IS 16 byte aligned	;
+		DEC	RAX	;  one element has been processed
+		aligned:
+		aligned8:
+		CMP	RAX, 8	;
+		JL	aligned2	;  len < 8- > EXIT TO aligned2
+		MOVAPD	XMM0, [RBX]	;
+		MOVAPD	XMM1, [RBX+16]	;
+		MOVAPD	XMM2, [RBX+32]	;
+		MOVAPD	XMM3, [RBX+48]	;
+		ADD	RBX, 64	;
+		MOVAPD	XMM4, [RCX]	;
+		MOVAPD	XMM5, [RCX+16]	;
+		MOVAPD	XMM6, [RCX+32]	;
+		MOVAPD	XMM7, [RCX+48]	;
+		ADD	RCX, 64	;
+		SUBPD	XMM0, XMM4	;
+		SUBPD	XMM1, XMM5	;
+		SUBPD	XMM2, XMM6	;
+		SUBPD	XMM3, XMM7	;
+		MOVAPD	[RDX], XMM0	;
+		MOVAPD	[RDX+16], XMM1	;
+		MOVAPD	[RDX+32], XMM2	;
+		MOVAPD	[RDX+48], XMM3	;
+		ADD	RDX, 64	;
+		SUB	RAX, 8	;
+		JMP	aligned8	;
+		;  LOOP FOR 2 pieces aligned
+		aligned2: ;
+		CMP	RAX, 2	;
+		JL	singlepieces	;  len < 2- > EXIT TO singlepieces
+		MOVAPD	XMM0, [RBX]	;
+		ADD	RBX, 16	;
+		MOVAPD	XMM1, [RCX]	;
+		ADD	RCX, 16	;
+		SUBPD	XMM0, XMM1	;
+		MOVAPD	[RDX], XMM0	;
+		ADD	RDX, 16	;
+		SUB	RAX, 2	;
+		JMP	aligned2	;
+		;  LOOP FOR 8 unaligned pieces(14 pieces not better!)
+		unaligned: ;
+		unaligned8: ;
+		CMP	RAX, 8	;
+		JL	unaligned2	;  len < 8- > EXIT TO unaligned2
+		MOVUPD	XMM0, [RBX]	;
+		MOVUPD	XMM1, [RBX+16]	;
+		MOVUPD	XMM2, [RBX+32]	;
+		MOVUPD	XMM3, [RBX+48]	;
+		ADD	RBX, 64	;
+		MOVUPD	XMM4, [RCX]	;
+		MOVUPD	XMM5, [RCX+16]	;
+		MOVUPD	XMM6, [RCX+32]	;
+		MOVUPD	XMM7, [RCX+48]	;
+		ADD	RCX, 64	;
+		SUBPD	XMM0, XMM4	;
+		SUBPD	XMM1, XMM5	;
+		SUBPD	XMM2, XMM6	;
+		SUBPD	XMM3, XMM7	;
+		MOVUPD	[RDX], XMM0	;
+		MOVUPD	[RDX+16], XMM1	;
+		MOVUPD	[RDX+32], XMM2	;
+		MOVUPD	[RDX+48], XMM3	;
+		ADD	RDX, 64	;
+		SUB	RAX, 8	;
+		JMP	unaligned8	;
+		;  LOOP FOR 2 pieces unaligned
+		unaligned2: ;
+		CMP	RAX, 2	;
+		JL	singlepieces	;  len < 2- > EXIT TO singlepieces
+		MOVUPD	XMM0, [RBX]	;
+		ADD	RBX, 16	;
+		MOVUPD	XMM1, [RCX]	;
+		ADD	RCX, 16	;
+		SUBPD	XMM0, XMM1	;
+		MOVUPD	[RDX], XMM0	;
+		ADD	RDX, 16	;
+		SUB	RAX, 2	;
+		JMP	unaligned2	;
+		;  one piece left OR non-contiguous data
+		single:
+		singlepieces: ;
+		CMP	RAX, 0	;
+		JLE	endL	;  len <= 0- > EXIT
+		MOVSD	XMM0, [RBX]
+		ADD	RBX, [RBP+linc]	;  INC(ladr, linc)
+		MOVSD	XMM1, [RCX]
+		ADD	RCX, [RBP+rinc]	;  INC(radr, rinc)
+		SUBSD	XMM0, XMM1	;
+		MOVSD	[RDX], XMM0
+		ADD	RDX, [RBP+dinc]	;  INC(dadr, dinc)
+		DEC	RAX	;  DEC(len)
+		JMP	singlepieces	;
+		endL:
+	END SubAXAXLoopSSE;
+
+	PROCEDURE SubARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.AMD64, SYSTEM.SSE2}
+		MOV	RAX, [RBP+len]	;
+		CMP	RAX, 0	;
+		JLE	endL	;  nothing TO be done, RAX > 0 guaranteed from here on
+		MOV	RBX, [RBP+ladr]	;
+		MOV	RCX, [RBP+radr]	;
+		MOV	RDX, [RBP+dadr]	;
+		;  check IF data are contiguous IN memory
+		CMP	[RBP+linc], 4	;  check left FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		CMP	[RBP+rinc], 4	;  check right FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		CMP	[RBP+dinc], 4	;  check destination FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		;  check FOR alignment
+		MOV	RSI, RBX	;
+		AND	RSI, 3	;  ladr MOD 4
+		CMP	RSI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	RSI, RCX	;
+		AND	RSI, 3	;  radr MOD 4
+		CMP	RSI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	RSI, RDX	;
+		AND	RSI, 3	;  dadr MOD 4
+		CMP	RSI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	RSI, RBX	;
+		AND	RSI, 8+4	;  16 byte alignment?
+		MOV	RDI, RCX	;
+		AND	RDI, 8+4	;  16 byte alignment?
+		CMP	RSI, RDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and radr
+		MOV	RDI, RDX	;
+		AND	RDI, 8+4	;  16 byte alignment
+		CMP	RSI, RDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF dadr and radr
+		CMP	RSI, 0	;
+		JE	aligned	;  already aligned
+		align:
+		;  one single element processing UNTIL 128 bit alignment achieved
+		MOVSS	XMM0, [RBX]	;
+		MOVSS	XMM1, [RCX]	;
+		SUBSS	XMM0, XMM1	;  left - right
+		MOVSS	[RDX], XMM0	;
+		ADD	RBX, 4	;
+		ADD	RCX, 4	;
+		ADD	RDX, 4	;
+		DEC	RAX	;  one element has been processed	;
+		CMP	RAX, 0	;  all elements already processed?
+		JLE	single	;
+		MOV	RSI, RBX	;
+		AND	RSI, 8+4	;
+		CMP	RSI, 0	;
+		JNE	align	;
+		aligned:
+		aligned16:
+		CMP	RAX, 16	;
+		JL	aligned4	;  len < 16- > EXIT TO aligned4
+		MOVAPS	XMM0, [RBX]	;
+		MOVAPS	XMM1, [RBX+16]	;
+		MOVAPS	XMM2, [RBX+32]	;
+		MOVAPS	XMM3, [RBX+48]	;
+		ADD	RBX, 64	;
+		MOVAPS	XMM4, [RCX]	;
+		MOVAPS	XMM5, [RCX+16]	;
+		MOVAPS	XMM6, [RCX+32]	;
+		MOVAPS	XMM7, [RCX+48]	;
+		ADD	RCX, 64	;
+		SUBPS	XMM0, XMM4	;
+		SUBPS	XMM1, XMM5	;
+		SUBPS	XMM2, XMM6	;
+		SUBPS	XMM3, XMM7	;
+		MOVAPS	[RDX], XMM0	;
+		MOVAPS	[RDX+16], XMM1	;
+		MOVAPS	[RDX+32], XMM2	;
+		MOVAPS	[RDX+48], XMM3	;
+		ADD	RDX, 64	;
+		SUB	RAX, 16	;
+		JMP	aligned16	;
+		;  LOOP FOR 4 pieces aligned
+		aligned4: ;
+		CMP	RAX, 4	;
+		JL	singlepieces	;  len < 4- > EXIT TO singlepieces
+		MOVAPS	XMM0, [RBX]	;
+		ADD	RBX, 16	;
+		MOVAPS	XMM1, [RCX]	;
+		ADD	RCX, 16	;
+		SUBPS	XMM0, XMM1	;
+		MOVAPS	[RDX], XMM0	;
+		ADD	RDX, 16	;
+		SUB	RAX, 4	;
+		JMP	aligned4	;
+		;  LOOP FOR 16 unaligned pieces
+		unaligned: ;
+		unaligned16: ;
+		CMP	RAX, 16	;
+		JL	unaligned4	;  len < 16- > EXIT TO unaligned4
+		MOVUPS	XMM0, [RBX]	;
+		MOVUPS	XMM1, [RBX+16]	;
+		MOVUPS	XMM2, [RBX+32]	;
+		MOVUPS	XMM3, [RBX+48]	;
+		ADD	RBX, 64	;
+		MOVUPS	XMM4, [RCX]	;
+		MOVUPS	XMM5, [RCX+16]	;
+		MOVUPS	XMM6, [RCX+32]	;
+		MOVUPS	XMM7, [RCX+48]	;
+		ADD	RCX, 64	;
+		SUBPS	XMM0, XMM4	;
+		SUBPS	XMM1, XMM5	;
+		SUBPS	XMM2, XMM6	;
+		SUBPS	XMM3, XMM7	;
+		MOVUPS	[RDX], XMM0	;
+		MOVUPS	[RDX+16], XMM1	;
+		MOVUPS	[RDX+32], XMM2	;
+		MOVUPS	[RDX+48], XMM3	;
+		ADD	RDX, 64	;
+		SUB	RAX, 16	;
+		JMP	unaligned16	;
+		;  LOOP FOR 4 pieces unaligned
+		unaligned4: ;
+		CMP	RAX, 4	;
+		JL	singlepieces	;  len < 4- > EXIT TO singlepieces
+		MOVUPS	XMM0, [RBX]	;
+		ADD	RBX, 16	;
+		MOVUPS	XMM1, [RCX]	;
+		ADD	RCX, 16	;
+		SUBPS	XMM0, XMM1	;
+		MOVUPS	[RDX], XMM0	;
+		ADD	RDX, 16	;
+		SUB	RAX, 4	;
+		JMP	unaligned4	;
+		;  one piece left OR non-contiguous data
+		single:
+		singlepieces: ;
+		CMP	RAX, 0	;
+		JLE	endL	;  len <= 0- > EXIT
+		MOVSS	XMM0, [RBX]
+		ADD	RBX, [RBP+linc]	;  INC(ladr, linc)
+		MOVSS	XMM1, [RCX]
+		ADD	RCX, [RBP+rinc]	;  INC(radr, rinc)
+		SUBSS	XMM0, XMM1	;
+		MOVSS	[RDX], XMM0
+		ADD	RDX, [RBP+dinc]	;  INC(dadr, dinc)
+		DEC	RAX	;  DEC(len)
+		JMP	singlepieces	;
+		endL:
+	END SubARARLoopSSE;
 
 	PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
 	CODE {SYSTEM.AMD64, SYSTEM.FPU}
@@ -6435,6 +6773,8 @@ VAR
 		ArrayBase.loopSPARAR := SPARARLoopA;
 		ArrayBase.loopAddAXAX := AddAXAXLoopA;
 		ArrayBase.loopAddARAR := AddARARLoopA;
+		ArrayBase.loopSubAXAX := SubAXAXLoopA;
+		ArrayBase.loopSubARAR := SubARARLoopA;
 		ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
 		ArrayBase.loopMatMulARAR := MatMulARARLoopA;
 		ArrayBase.loopMulAXSX := MulAXSXLoopA;
@@ -6453,6 +6793,7 @@ VAR
 			KernelLog.String( "SSE " );
 			ArrayBase.loopSPARAR := SPARARLoopSSE;
 			ArrayBase.loopAddARAR := AddARARLoopSSE;
+			ArrayBase.loopSubARAR := SubARARLoopSSE;
 			ArrayBase.loopMulARSR := MulARSRLoopSSE;
 			ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
 			ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
@@ -6476,6 +6817,7 @@ VAR
 			KernelLog.String( "SSE2 " );
 			ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
 			ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
+			ArrayBase.loopSubAXAX := SubAXAXLoopSSE;
 			ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
 			ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
 			ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;

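Both SSE procedures above follow the same pattern: if any increment differs from the element size they fall back to an element-wise loop ("singlepieces"); with contiguous data they use aligned moves (MOVAPD/MOVAPS) when all three pointers can be brought to a common 16-byte alignment, after peeling at most one (AXAX) or three (ARAR) leading elements, and unaligned moves (MOVUPD/MOVUPS) otherwise, subtracting in 64-byte blocks with a smaller 16-byte loop and a scalar tail for the remainder. The element-wise path is equivalent to the following plain Oberon, shown only as a readable model (module and procedure names are illustrative, not code from this commit):

MODULE SubScalarModel;	(* illustrative sketch *)
IMPORT SYSTEM;

	PROCEDURE SubLoop*( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
	VAR l, r: LONGREAL;
	BEGIN
		WHILE len > 0 DO
			SYSTEM.GET( ladr, l );  SYSTEM.GET( radr, r );	(* fetch left and right elements *)
			SYSTEM.PUT( dadr, l - r );	(* destination := left - right *)
			INC( ladr, linc );  INC( radr, rinc );  INC( dadr, dinc );	(* advance by the given strides *)
			DEC( len )
		END
	END SubLoop;

END SubScalarModel.
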
+ 8 - 7
source/FoxArrayBase.Mod

@@ -154,6 +154,7 @@ VAR
 	loopSPAXAX*, loopSPARAR*: BinaryAASLoop;
 	loopSPAZAZ, loopSPALZALZ: BinaryAASLoop;
 	loopAddAXAX*, loopAddARAR*, loopAddAZAZ*, loopAddALZALZ*: BinaryAAALoop;
+	loopSubAXAX*, loopSubARAR*, loopSubAZAZ*, loopSubALZALZ*: BinaryAAALoop;
 	loopMatMulAXAX*, loopMatMulARAR*: BinaryAASLoop;
 	loopMatMulIncAXAX*, loopMatMulIncARAR*: BinaryAASLoop;
 	loopMulAXSX*, loopMulARSR*, loopMulAZSZ*, loopMulALZSLZ*: BinaryASALoop;
@@ -185,10 +186,10 @@ VAR
 	PROCEDURE SetDefaults*;   (* set standard procedures *)
 	BEGIN
 		KernelLog.String( "ArrayBase XXXXXXX: setting runtime library (semi-optimized) default methods." );  KernelLog.Ln;  loopSPAXAX := SPAXAXLoop;
-		loopSPARAR := SPARARLoop;  loopAddAXAX := AddAXAXLoop;
+		loopSPARAR := SPARARLoop;  loopAddAXAX := AddAXAXLoop;  loopSubAXAX := SubAXAXLoop;
 		loopSPAZAZ := SPAZAZLoop; loopSPALZALZ := SPALZALZLoop;
-		loopAddARAR := AddARARLoop;  loopMatMulAXAX := MatMulAXAXLoop;
-		loopAddAZAZ := AddAZAZLoop; loopAddALZALZ := AddALZALZLoop;
+		loopAddARAR := AddARARLoop; loopSubARAR := SubARARLoop;  loopMatMulAXAX := MatMulAXAXLoop;
+		loopAddAZAZ := AddAZAZLoop; loopAddALZALZ := AddALZALZLoop; loopSubAZAZ := SubAZAZLoop; loopSubALZALZ := SubALZALZLoop;
 		loopMatMulIncAXAX := MatMulIncAXAXLoop;
 		loopMatMulARAR := MatMulARARLoop;  loopMulAXSX := MulAXSXLoop;
 		loopIncMulAXSX := IncMulAXSXLoop;
@@ -2673,7 +2674,7 @@ Sufficient (but not necessary) conditions:
 	OPERATOR "-"*(CONST left,right: ARRAY [?] OF REAL): ARRAY {UNSAFE} [?] OF REAL;
 	BEGIN
 		ApplyBinaryAAAOp( RESULT, left, right, SIZEOF( REAL ),
-										  SubARARLoop );
+										  loopSubARAR );
 		RETURN RESULT
 	END "-";
 
@@ -2690,7 +2691,7 @@ Sufficient (but not necessary) conditions:
 	OPERATOR "-"*(CONST left,right: ARRAY [?] OF LONGREAL): ARRAY {UNSAFE} [?] OF LONGREAL;
 	BEGIN
 		ApplyBinaryAAAOp( RESULT, left, right,
-										  SIZEOF( LONGREAL ), SubAXAXLoop );
+										  SIZEOF( LONGREAL ), loopSubAXAX );
 		RETURN RESULT
 	END "-";
 
@@ -2707,7 +2708,7 @@ Sufficient (but not necessary) conditions:
 	OPERATOR "-"*(CONST left,right: ARRAY [?] OF COMPLEX): ARRAY {UNSAFE} [?] OF COMPLEX;
 	BEGIN
 		ApplyBinaryAAAOp( RESULT, left, right,
-										  SIZEOF( COMPLEX ), SubAZAZLoop );
+										  SIZEOF( COMPLEX ), loopSubAZAZ );
 		RETURN RESULT
 	END "-";
 
@@ -2727,7 +2728,7 @@ Sufficient (but not necessary) conditions:
 	OPERATOR "-"*(CONST left,right: ARRAY [?] OF LONGCOMPLEX): ARRAY {UNSAFE} [?] OF LONGCOMPLEX;
 	BEGIN
 		ApplyBinaryAAAOp( RESULT, left, right,
-										  SIZEOF( LONGCOMPLEX ), SubALZALZLoop );
+										  SIZEOF( LONGCOMPLEX ), loopSubALZALZ );
 		RETURN RESULT
 	END "-";
 

+ 342 - 0
source/I386.FoxArrayBaseOptimized.Mod

@@ -1461,6 +1461,344 @@ VAR
 		endL:
 	END AddARARLoopSSE;
 
+(* array subtraction *)
+	PROCEDURE SubAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.i386, SYSTEM.FPU}
+		MOV	EAX, [EBP+len]	;
+		MOV	EBX, [EBP+ladr]	;
+		MOV	ECX, [EBP+radr]	;
+		MOV	EDX, [EBP+dadr]	;
+		start:
+		CMP	EAX, 0	;
+		JLE	endL	;
+		FLD	QWORD [EBX]	;
+		ADD	EBX, [EBP+linc]	;
+		FLD	QWORD [ECX]	;
+		ADD	ECX, [EBP+rinc]	;
+		FSUBP	;
+		FSTP	QWORD [EDX]	;
+		ADD	EDX, [EBP+dinc]	;
+		DEC	EAX	;
+		JMP	start	;
+		endL:
+		FWAIT	;
+	END SubAXAXLoopA;
+
+	PROCEDURE SubARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.i386, SYSTEM.FPU}
+		MOV	EAX, [EBP+len]	;
+		MOV	EBX, [EBP+ladr]	;
+		MOV	ECX, [EBP+radr]	;
+		MOV	EDX, [EBP+dadr]	;
+		start:
+		CMP	EAX, 0	;
+		JLE	endL	;
+		FLD	DWORD [EBX]	;
+		ADD	EBX, [EBP+linc]	;
+		FLD	DWORD [ECX]	;
+		ADD	ECX, [EBP+rinc]	;
+		FSUBP	;
+		FSTP	DWORD [EDX]	;
+		ADD	EDX, [EBP+dinc]	;
+		DEC	EAX	;
+		JMP	start	;
+		endL:
+		FWAIT	;
+	END SubARARLoopA;
+
+	PROCEDURE SubAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.i386, SYSTEM.SSE2}
+		MOV	EAX, [EBP+len]	;
+		CMP	EAX, 0	;
+		JLE	endL	;  nothing TO be done, EAX > 0 guaranteed from here on
+		MOV	EBX, [EBP+ladr]	;
+		MOV	ECX, [EBP+radr]	;
+		MOV	EDX, [EBP+dadr]	;
+		;  check IF data are contiguous IN memory
+		CMP	[EBP+linc], 8	;  check left FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		CMP	[EBP+rinc], 8	;  check right FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		CMP	[EBP+dinc], 8	;  check destination FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		;  check FOR alignment
+		MOV	ESI, EBX	;
+		AND	ESI, 7	;  ladr MOD 8
+		CMP	ESI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	ESI, ECX	;
+		AND	ESI, 7	;  radr MOD 8
+		CMP	ESI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	ESI, EDX	;
+		AND	ESI, 7	;  dadr MOD 8
+		CMP	ESI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	ESI, EBX	;
+		AND	ESI, 8	;  16 byte alignment
+		MOV	EDI, ECX	;
+		AND	EDI, 8	;  16 byte alignment
+		CMP	ESI, EDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and radr
+		MOV	EDI, EDX	;
+		AND	EDI, 8	;  16 byte alignment
+		CMP	ESI, EDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF dadr and radr
+		CMP	ESI, 8	;
+		JNE	aligned	;  ladr, radr and dadr already 128 bit aligned
+		;  one single element processing TO achieve 128 bit alignment
+		MOVSD	XMM0, [EBX]	;
+		MOVSD	XMM1, [ECX]	;
+		SUBSD	XMM0, XMM1	;  left - right
+		MOVSD	[EDX], XMM0	;
+		ADD	EBX, 8	;  now EBX IS 16 byte aligned
+		ADD	ECX, 8	;  now ECX IS 16 byte aligned	;
+		ADD	EDX, 8	;  now EDX IS 16 byte aligned	;
+		DEC	EAX	;  one element has been processed
+		aligned:
+		aligned8:
+		CMP	EAX, 8	;
+		JL	aligned2	;  len < 8- > EXIT TO aligned2
+		MOVAPD	XMM0, [EBX]	;
+		MOVAPD	XMM1, [EBX+16]	;
+		MOVAPD	XMM2, [EBX+32]	;
+		MOVAPD	XMM3, [EBX+48]	;
+		ADD	EBX, 64	;
+		MOVAPD	XMM4, [ECX]	;
+		MOVAPD	XMM5, [ECX+16]	;
+		MOVAPD	XMM6, [ECX+32]	;
+		MOVAPD	XMM7, [ECX+48]	;
+		ADD	ECX, 64	;
+		SUBPD	XMM0, XMM4	;
+		SUBPD	XMM1, XMM5	;
+		SUBPD	XMM2, XMM6	;
+		SUBPD	XMM3, XMM7	;
+		MOVAPD	[EDX], XMM0	;
+		MOVAPD	[EDX+16], XMM1	;
+		MOVAPD	[EDX+32], XMM2	;
+		MOVAPD	[EDX+48], XMM3	;
+		ADD	EDX, 64	;
+		SUB	EAX, 8	;
+		JMP	aligned8	;
+		;  LOOP FOR 2 pieces aligned
+		aligned2: ;
+		CMP	EAX, 2	;
+		JL	singlepieces	;  len < 2- > EXIT TO singlepieces
+		MOVAPD	XMM0, [EBX]	;
+		ADD	EBX, 16	;
+		MOVAPD	XMM1, [ECX]	;
+		ADD	ECX, 16	;
+		SUBPD	XMM0, XMM1	;
+		MOVAPD	[EDX], XMM0	;
+		ADD	EDX, 16	;
+		SUB	EAX, 2	;
+		JMP	aligned2	;
+		;  LOOP FOR 8 unaligned pieces(14 pieces not better!)
+		unaligned: ;
+		unaligned8: ;
+		CMP	EAX, 8	;
+		JL	unaligned2	;  len < 8- > EXIT TO unaligned2
+		MOVUPD	XMM0, [EBX]	;
+		MOVUPD	XMM1, [EBX+16]	;
+		MOVUPD	XMM2, [EBX+32]	;
+		MOVUPD	XMM3, [EBX+48]	;
+		ADD	EBX, 64	;
+		MOVUPD	XMM4, [ECX]	;
+		MOVUPD	XMM5, [ECX+16]	;
+		MOVUPD	XMM6, [ECX+32]	;
+		MOVUPD	XMM7, [ECX+48]	;
+		ADD	ECX, 64	;
+		SUBPD	XMM0, XMM4	;
+		SUBPD	XMM1, XMM5	;
+		SUBPD	XMM2, XMM6	;
+		SUBPD	XMM3, XMM7	;
+		MOVUPD	[EDX], XMM0	;
+		MOVUPD	[EDX+16], XMM1	;
+		MOVUPD	[EDX+32], XMM2	;
+		MOVUPD	[EDX+48], XMM3	;
+		ADD	EDX, 64	;
+		SUB	EAX, 8	;
+		JMP	unaligned8	;
+		;  LOOP FOR 2 pieces unaligned
+		unaligned2: ;
+		CMP	EAX, 2	;
+		JL	singlepieces	;  len < 2- > EXIT TO singlepieces
+		MOVUPD	XMM0, [EBX]	;
+		ADD	EBX, 16	;
+		MOVUPD	XMM1, [ECX]	;
+		ADD	ECX, 16	;
+		SUBPD	XMM0, XMM1	;
+		MOVUPD	[EDX], XMM0	;
+		ADD	EDX, 16	;
+		SUB	EAX, 2	;
+		JMP	unaligned2	;
+		;  one piece left OR non-contiguous data
+		single:
+		singlepieces: ;
+		CMP	EAX, 0	;
+		JLE	endL	;  len <= 0- > EXIT
+		MOVSD	XMM0, [EBX]
+		ADD	EBX, [EBP+linc]	;  INC(ladr, linc)
+		MOVSD	XMM1, [ECX]
+		ADD	ECX, [EBP+rinc]	;  INC(radr, rinc)
+		SUBSD	XMM0, XMM1	;
+		MOVSD	[EDX], XMM0
+		ADD	EDX, [EBP+dinc]	;  INC(dadr, dinc)
+		DEC	EAX	;  DEC(len)
+		JMP	singlepieces	;
+		endL:
+	END SubAXAXLoopSSE;
+
+	PROCEDURE SubARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.i386, SYSTEM.SSE2}
+		MOV	EAX, [EBP+len]	;
+		CMP	EAX, 0	;
+		JLE	endL	;  nothing TO be done, EAX > 0 guaranteed from here on
+		MOV	EBX, [EBP+ladr]	;
+		MOV	ECX, [EBP+radr]	;
+		MOV	EDX, [EBP+dadr]	;
+		;  check IF data are contiguous IN memory
+		CMP	[EBP+linc], 4	;  check left FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		CMP	[EBP+rinc], 4	;  check right FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		CMP	[EBP+dinc], 4	;  check destination FOR continuity
+		JNE	single	;  not continuous- > simplest method
+		;  check FOR alignment
+		MOV	ESI, EBX	;
+		AND	ESI, 3	;  ladr MOD 4
+		CMP	ESI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	ESI, ECX	;
+		AND	ESI, 3	;  radr MOD 4
+		CMP	ESI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	ESI, EDX	;
+		AND	ESI, 3	;  dadr MOD 4
+		CMP	ESI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	ESI, EBX	;
+		AND	ESI, 8+4	;  16 byte alignment?
+		MOV	EDI, ECX	;
+		AND	EDI, 8+4	;  16 byte alignment?
+		CMP	ESI, EDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and radr
+		MOV	EDI, EDX	;
+		AND	EDI, 8+4	;  16 byte alignment
+		CMP	ESI, EDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF dadr and radr
+		CMP	ESI, 0	;
+		JE	aligned	;  already aligned
+		align:
+		;  one single element processing UNTIL 128 bit alignment achieved
+		MOVSS	XMM0, [EBX]	;
+		MOVSS	XMM1, [ECX]	;
+		SUBSS	XMM0, XMM1	;  left - right
+		MOVSS	[EDX], XMM0	;
+		ADD	EBX, 4	;
+		ADD	ECX, 4	;
+		ADD	EDX, 4	;
+		DEC	EAX	;  one element has been processed	;
+		CMP	EAX, 0	;  all elements already processed?
+		JLE	single	;
+		MOV	ESI, EBX	;
+		AND	ESI, 8+4	;
+		CMP	ESI, 0	;
+		JNE	align	;
+		aligned:
+		aligned16:
+		CMP	EAX, 16	;
+		JL	aligned4	;  len < 16- > EXIT TO aligned4
+		MOVAPS	XMM0, [EBX]	;
+		MOVAPS	XMM1, [EBX+16]	;
+		MOVAPS	XMM2, [EBX+32]	;
+		MOVAPS	XMM3, [EBX+48]	;
+		ADD	EBX, 64	;
+		MOVAPS	XMM4, [ECX]	;
+		MOVAPS	XMM5, [ECX+16]	;
+		MOVAPS	XMM6, [ECX+32]	;
+		MOVAPS	XMM7, [ECX+48]	;
+		ADD	ECX, 64	;
+		SUBPS	XMM0, XMM4	;
+		SUBPS	XMM1, XMM5	;
+		SUBPS	XMM2, XMM6	;
+		SUBPS	XMM3, XMM7	;
+		MOVAPS	[EDX], XMM0	;
+		MOVAPS	[EDX+16], XMM1	;
+		MOVAPS	[EDX+32], XMM2	;
+		MOVAPS	[EDX+48], XMM3	;
+		ADD	EDX, 64	;
+		SUB	EAX, 16	;
+		JMP	aligned16	;
+		;  LOOP FOR 4 pieces aligned
+		aligned4: ;
+		CMP	EAX, 4	;
+		JL	singlepieces	;  len < 4- > EXIT TO singlepieces
+		MOVAPS	XMM0, [EBX]	;
+		ADD	EBX, 16	;
+		MOVAPS	XMM1, [ECX]	;
+		ADD	ECX, 16	;
+		SUBPS	XMM0, XMM1	;
+		MOVAPS	[EDX], XMM0	;
+		ADD	EDX, 16	;
+		SUB	EAX, 4	;
+		JMP	aligned4	;
+		;  LOOP FOR 16 unaligned pieces
+		unaligned: ;
+		unaligned16: ;
+		CMP	EAX, 16	;
+		JL	unaligned4	;  len < 16- > EXIT TO unaligned4
+		MOVUPS	XMM0, [EBX]	;
+		MOVUPS	XMM1, [EBX+16]	;
+		MOVUPS	XMM2, [EBX+32]	;
+		MOVUPS	XMM3, [EBX+48]	;
+		ADD	EBX, 64	;
+		MOVUPS	XMM4, [ECX]	;
+		MOVUPS	XMM5, [ECX+16]	;
+		MOVUPS	XMM6, [ECX+32]	;
+		MOVUPS	XMM7, [ECX+48]	;
+		ADD	ECX, 64	;
+		SUBPS	XMM0, XMM4	;
+		SUBPS	XMM1, XMM5	;
+		SUBPS	XMM2, XMM6	;
+		SUBPS	XMM3, XMM7	;
+		MOVUPS	[EDX], XMM0	;
+		MOVUPS	[EDX+16], XMM1	;
+		MOVUPS	[EDX+32], XMM2	;
+		MOVUPS	[EDX+48], XMM3	;
+		ADD	EDX, 64	;
+		SUB	EAX, 16	;
+		JMP	unaligned16	;
+		;  LOOP FOR 4 pieces unaligned
+		unaligned4: ;
+		CMP	EAX, 4	;
+		JL	singlepieces	;  len < 4- > EXIT TO singlepieces
+		MOVUPS	XMM0, [EBX]	;
+		ADD	EBX, 16	;
+		MOVUPS	XMM1, [ECX]	;
+		ADD	ECX, 16	;
+		SUBPS	XMM0, XMM1	;
+		MOVUPS	[EDX], XMM0	;
+		ADD	EDX, 16	;
+		SUB	EAX, 4	;
+		JMP	unaligned4	;
+		;  one piece left OR non-contiguous data
+		single:
+		singlepieces: ;
+		CMP	EAX, 0	;
+		JLE	endL	;  len <= 0- > EXIT
+		MOVSS	XMM0, [EBX]
+		ADD	EBX, [EBP+linc]	;  INC(ladr, linc)
+		MOVSS	XMM1, [ECX]
+		ADD	ECX, [EBP+rinc]	;  INC(radr, rinc)
+		SUBSS	XMM0, XMM1	;
+		MOVSS	[EDX], XMM0
+		ADD	EDX, [EBP+dinc]	;  INC(dadr, dinc)
+		DEC	EAX	;  DEC(len)
+		JMP	singlepieces	;
+		endL:
+	END SubARARLoopSSE;
+
 	PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
 	CODE {SYSTEM.i386, SYSTEM.FPU}
 		MOV	EAX, [EBP+len]	;  eax := len
@@ -6437,6 +6775,8 @@ VAR
 		ArrayBase.loopSPARAR := SPARARLoopA;
 		ArrayBase.loopAddAXAX := AddAXAXLoopA;
 		ArrayBase.loopAddARAR := AddARARLoopA;
+		ArrayBase.loopSubAXAX := SubAXAXLoopA;
+		ArrayBase.loopSubARAR := SubARARLoopA;
 		ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
 		ArrayBase.loopMatMulARAR := MatMulARARLoopA;
 		ArrayBase.loopMulAXSX := MulAXSXLoopA;
@@ -6455,6 +6795,7 @@ VAR
 			KernelLog.String( "SSE " );
 			ArrayBase.loopSPARAR := SPARARLoopSSE;
 			ArrayBase.loopAddARAR := AddARARLoopSSE;
+			ArrayBase.loopSubARAR := SubARARLoopSSE;
 			ArrayBase.loopMulARSR := MulARSRLoopSSE;
 			ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
 			ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
@@ -6478,6 +6819,7 @@ VAR
 			KernelLog.String( "SSE2 " );
 			ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
 			ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
+			ArrayBase.loopSubAXAX := SubAXAXLoopSSE;
 			ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
 			ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
 			ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;