
added support for elementwise array multiplication optimizations

git-svn-id: https://svn.inf.ethz.ch/svn/lecturers/a2/trunk@8669 8c9fc860-2736-0410-a75d-ab315db34111
eth.morozova 6 years ago
commit 71fcfe6c5b
3 changed files with 693 additions and 7 deletions
  1. 342 0
      source/AMD64.FoxArrayBaseOptimized.Mod
  2. 9 7
      source/FoxArrayBase.Mod
  3. 342 0
      source/I386.FoxArrayBaseOptimized.Mod
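
The new assembly loops back the elementwise multiplication operator ".*" that FoxArrayBase.Mod exports for REAL, LONGREAL, COMPLEX and LONGCOMPLEX arrays. A minimal usage sketch (the test module, array sizes and values are illustrative only and not part of this commit):

	MODULE TestEMul;   (* hypothetical example, not part of this commit *)
	IMPORT KernelLog;

		PROCEDURE Do*;
		VAR a, b, c: ARRAY [*] OF REAL;  i: SIZE;
		BEGIN
			NEW( a, 1000 );  NEW( b, 1000 );
			FOR i := 0 TO LEN( a, 0 ) - 1 DO  a[i] := 3.0;  b[i] := 2.0  END;
			(* elementwise product: c[i] = a[i] * b[i]; dispatched through ArrayBase.loopEMulARAR,
			   i.e. EMulARARLoopSSE when SSE extensions are available *)
			c := a .* b;
			ASSERT( c[5] = 6.0 );
			KernelLog.String( "EMul OK" );  KernelLog.Ln
		END Do;

	END TestEMul.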

+ 342 - 0
source/AMD64.FoxArrayBaseOptimized.Mod

@@ -1798,6 +1798,344 @@ VAR
 		JMP	singlepieces	;
 		endL:
 	END SubARARLoopSSE;
+	
+	(* loops for elementwise array multiplication (dest[i] := left[i] * right[i]) *)
+	PROCEDURE EMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.AMD64, SYSTEM.FPU}
+		MOV	RAX, [RBP+len]	;
+		MOV	RBX, [RBP+ladr]	;
+		MOV	RCX, [RBP+radr]	;
+		MOV	RDX, [RBP+dadr]	;
+		start:
+		CMP	RAX, 0	;
+		JLE	endL	;
+		FLD	QWORD [RBX]	;
+		ADD	RBX, [RBP+linc]	;
+		FLD	QWORD [RCX]	;
+		ADD	RCX, [RBP+rinc]	;
+		FMULP	;
+		FSTP	QWORD [RDX]	;
+		ADD	RDX, [RBP+dinc]	;
+		DEC	RAX	;
+		JMP	start	;
+		endL:
+		FWAIT	;
+	END EMulAXAXLoopA;
+
+	PROCEDURE EMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.AMD64, SYSTEM.FPU}
+		MOV	RAX, [RBP+len]	;
+		MOV	RBX, [RBP+ladr]	;
+		MOV	RCX, [RBP+radr]	;
+		MOV	RDX, [RBP+dadr]	;
+		start:
+		CMP	RAX, 0	;
+		JLE	endL	;
+		FLD	DWORD [RBX]	;
+		ADD	RBX, [RBP+linc]	;
+		FLD	DWORD [RCX]	;
+		ADD	RCX, [RBP+rinc]	;
+		FMULP	;
+		FSTP	DWORD [RDX]	;
+		ADD	RDX, [RBP+dinc]	;
+		DEC	RAX	;
+		JMP	start	;
+		endL:
+		FWAIT	;
+	END EMulARARLoopA;
+
+	PROCEDURE EMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.AMD64, SYSTEM.SSE2}
+		MOV	RAX, [RBP+len]	;
+		CMP	RAX, 0	;
+		JLE	endL	;  nothing TO be done, RAX > 0 guaranteed from here on
+		MOV	RBX, [RBP+ladr]	;
+		MOV	RCX, [RBP+radr]	;
+		MOV	RDX, [RBP+dadr]	;
+		;  check IF data are contiguous IN memory
+		CMP	[RBP+linc], 8	;  check left FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		CMP	[RBP+rinc], 8	;  check right FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		CMP	[RBP+dinc], 8	;  check destination FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		;  check FOR alignment
+		MOV	RSI, RBX	;
+		AND	RSI, 7	;  ladr MOD 8
+		CMP	RSI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	RSI, RCX	;
+		AND	RSI, 7	;  radr MOD 8
+		CMP	RSI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	RSI, RDX	;
+		AND	RSI, 7	;  dadr MOD 8
+		CMP	RSI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	RSI, RBX	;
+		AND	RSI, 8	;  16 byte alignment
+		MOV	RDI, RCX	;
+		AND	RDI, 8	;  16 byte alignment
+		CMP	RSI, RDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and radr
+		MOV	RDI, RDX	;
+		AND	RDI, 8	;  16 byte alignment
+		CMP	RSI, RDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and dadr
+		CMP	RSI, 8	;
+		JNE	aligned	;  ladr, radr and dadr already 128 bit aligned
+		;  process one single element TO achieve 128 bit alignment
+		MOVSD	XMM1, [RBX]	;
+		MOVSD	XMM0, [RCX]	;
+		MULSD	XMM0, XMM1	;
+		MOVSD	[RDX], XMM0	;
+		ADD	RBX, 8	;  now RBX IS 16 byte aligned
+		ADD	RCX, 8	;  now RCX IS 16 byte aligned	;
+		ADD	RDX, 8	;  now RDX IS 16 byte aligned	;
+		DEC	RAX	;  one element has been processed
+		aligned:
+		aligned8:
+		CMP	RAX, 8	;
+		JL	aligned2	;  len < 8 -> EXIT TO aligned2
+		MOVAPD	XMM0, [RBX]	;
+		MOVAPD	XMM1, [RBX+16]	;
+		MOVAPD	XMM2, [RBX+32]	;
+		MOVAPD	XMM3, [RBX+48]	;
+		ADD	RBX, 64	;
+		MOVAPD	XMM4, [RCX]	;
+		MOVAPD	XMM5, [RCX+16]	;
+		MOVAPD	XMM6, [RCX+32]	;
+		MOVAPD	XMM7, [RCX+48]	;
+		ADD	RCX, 64	;
+		MULPD	XMM0, XMM4	;
+		MULPD	XMM1, XMM5	;
+		MULPD	XMM2, XMM6	;
+		MULPD	XMM3, XMM7	;
+		MOVAPD	[RDX], XMM0	;
+		MOVAPD	[RDX+16], XMM1	;
+		MOVAPD	[RDX+32], XMM2	;
+		MOVAPD	[RDX+48], XMM3	;
+		ADD	RDX, 64	;
+		SUB	RAX, 8	;
+		JMP	aligned8	;
+		;  LOOP FOR 2 pieces aligned
+		aligned2: ;
+		CMP	RAX, 2	;
+		JL	singlepieces	;  len < 2- > EXIT TO singlepieces
+		MOVAPD	XMM0, [RBX]	;
+		ADD	RBX, 16	;
+		MOVAPD	XMM1, [RCX]	;
+		ADD	RCX, 16	;
+		MULPD	XMM0, XMM1	;
+		MOVAPD	[RDX], XMM0	;
+		ADD	RDX, 16	;
+		SUB	RAX, 2	;
+		JMP	aligned2	;
+		;  LOOP FOR 8 unaligned pieces(14 pieces not better!)
+		unaligned: ;
+		unaligned8: ;
+		CMP	RAX, 8	;
+		JL	unaligned2	;  len < 8 -> EXIT TO unaligned2
+		MOVUPD	XMM0, [RBX]	;
+		MOVUPD	XMM1, [RBX+16]	;
+		MOVUPD	XMM2, [RBX+32]	;
+		MOVUPD	XMM3, [RBX+48]	;
+		ADD	RBX, 64	;
+		MOVUPD	XMM4, [RCX]	;
+		MOVUPD	XMM5, [RCX+16]	;
+		MOVUPD	XMM6, [RCX+32]	;
+		MOVUPD	XMM7, [RCX+48]	;
+		ADD	RCX, 64	;
+		MULPD	XMM0, XMM4	;
+		MULPD	XMM1, XMM5	;
+		MULPD	XMM2, XMM6	;
+		MULPD	XMM3, XMM7	;
+		MOVUPD	[RDX], XMM0	;
+		MOVUPD	[RDX+16], XMM1	;
+		MOVUPD	[RDX+32], XMM2	;
+		MOVUPD	[RDX+48], XMM3	;
+		ADD	RDX, 64	;
+		SUB	RAX, 8	;
+		JMP	unaligned8	;
+		;  LOOP FOR 2 unaligned pieces
+		unaligned2: ;
+		CMP	RAX, 2	;
+		JL	singlepieces	;  len < 2- > EXIT TO singlepieces
+		MOVUPD	XMM0, [RBX]	;
+		ADD	RBX, 16	;
+		MOVUPD	XMM1, [RCX]	;
+		ADD	RCX, 16	;
+		MULPD	XMM0, XMM1	;
+		MOVUPD	[RDX], XMM0	;
+		ADD	RDX, 16	;
+		SUB	RAX, 2	;
+		JMP	unaligned2	;
+		;  one piece left OR non-contiguous data
+		single:
+		singlepieces: ;
+		CMP	RAX, 0	;
+		JLE	endL	;  len <= 0- > EXIT
+		MOVSD	XMM0, [RBX]
+		ADD	RBX, [RBP+linc]	;  INC(ladr, linc)
+		MOVSD	XMM1, [RCX]
+		ADD	RCX, [RBP+rinc]	;  INC(radr, rinc)
+		MULSD	XMM0, XMM1	;
+		MOVSD	[RDX], XMM0
+		ADD	RDX, [RBP+dinc]	;  INC(dadr, dinc)
+		DEC	RAX	;  DEC(len)
+		JMP	singlepieces	;
+		endL:
+	END EMulAXAXLoopSSE;
+
+	PROCEDURE EMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.AMD64, SYSTEM.SSE2}
+		MOV	RAX, [RBP+len]	;
+		CMP	RAX, 0	;
+		JLE	endL	;  nothing TO be done, RAX > 0 guaranteed from here on
+		MOV	RBX, [RBP+ladr]	;
+		MOV	RCX, [RBP+radr]	;
+		MOV	RDX, [RBP+dadr]	;
+		;  check IF data are contiguous IN memory
+		CMP	[RBP+linc], 4	;  check left FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		CMP	[RBP+rinc], 4	;  check right FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		CMP	[RBP+dinc], 4	;  check destination FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		;  check FOR alignment
+		MOV	RSI, RBX	;
+		AND	RSI, 3	;  ladr MOD 4
+		CMP	RSI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	RSI, RCX	;
+		AND	RSI, 3	;  radr MOD 4
+		CMP	RSI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	RSI, RDX	;
+		AND	RSI, 3	;  dadr MOD 4
+		CMP	RSI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	RSI, RBX	;
+		AND	RSI, 8+4	;  16 byte alignment?
+		MOV	RDI, RCX	;
+		AND	RDI, 8+4	;  16 byte alignment?
+		CMP	RSI, RDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and radr
+		MOV	RDI, RDX	;
+		AND	RDI, 8+4	;  16 byte alignment
+		CMP	RSI, RDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and dadr
+		CMP	RSI, 0	;
+		JE	aligned	;  already aligned
+		align:
+		;  process single elements UNTIL 128 bit alignment is achieved
+		MOVSS	XMM1, [RBX]	;
+		MOVSS	XMM0, [RCX]	;
+		MULSS	XMM0, XMM1	;
+		MOVSS	[RDX], XMM0	;
+		ADD	RBX, 4	;
+		ADD	RCX, 4	;
+		ADD	RDX, 4	;
+		DEC	RAX	;  one element has been processed	;
+		CMP	RAX, 0	;  all elements already processed?
+		JLE	single	;
+		MOV	RSI, RBX	;
+		AND	RSI, 8+4	;
+		CMP	RSI, 0	;
+		JNE	align	;
+		aligned:
+		aligned16:
+		CMP	RAX, 16	;
+		JL	aligned4	;  len < 16 -> EXIT TO aligned4
+		MOVAPS	XMM0, [RBX]	;
+		MOVAPS	XMM1, [RBX+16]	;
+		MOVAPS	XMM2, [RBX+32]	;
+		MOVAPS	XMM3, [RBX+48]	;
+		ADD	RBX, 64	;
+		MOVAPS	XMM4, [RCX]	;
+		MOVAPS	XMM5, [RCX+16]	;
+		MOVAPS	XMM6, [RCX+32]	;
+		MOVAPS	XMM7, [RCX+48]	;
+		ADD	RCX, 64	;
+		MULPS	XMM0, XMM4	;
+		MULPS	XMM1, XMM5	;
+		MULPS	XMM2, XMM6	;
+		MULPS	XMM3, XMM7	;
+		MOVAPS	[RDX], XMM0	;
+		MOVAPS	[RDX+16], XMM1	;
+		MOVAPS	[RDX+32], XMM2	;
+		MOVAPS	[RDX+48], XMM3	;
+		ADD	RDX, 64	;
+		SUB	RAX, 16	;
+		JMP	aligned16	;
+		;  LOOP FOR 4 pieces aligned
+		aligned4: ;
+		CMP	RAX, 4	;
+		JL	singlepieces	;  len < 4 -> EXIT TO singlepieces
+		MOVAPS	XMM0, [RBX]	;
+		ADD	RBX, 16	;
+		MOVAPS	XMM1, [RCX]	;
+		ADD	RCX, 16	;
+		MULPS	XMM0, XMM1	;
+		MOVAPS	[RDX], XMM0	;
+		ADD	RDX, 16	;
+		SUB	RAX, 4	;
+		JMP	aligned4	;
+		;  LOOP FOR 16 unaligned pieces
+		unaligned: ;
+		unaligned16: ;
+		CMP	RAX, 16	;
+		JL	unaligned4	;  len < 16 -> EXIT TO unaligned4
+		MOVUPS	XMM0, [RBX]	;
+		MOVUPS	XMM1, [RBX+16]	;
+		MOVUPS	XMM2, [RBX+32]	;
+		MOVUPS	XMM3, [RBX+48]	;
+		ADD	RBX, 64	;
+		MOVUPS	XMM4, [RCX]	;
+		MOVUPS	XMM5, [RCX+16]	;
+		MOVUPS	XMM6, [RCX+32]	;
+		MOVUPS	XMM7, [RCX+48]	;
+		ADD	RCX, 64	;
+		MULPS	XMM0, XMM4	;
+		MULPS	XMM1, XMM5	;
+		MULPS	XMM2, XMM6	;
+		MULPS	XMM3, XMM7	;
+		MOVUPS	[RDX], XMM0	;
+		MOVUPS	[RDX+16], XMM1	;
+		MOVUPS	[RDX+32], XMM2	;
+		MOVUPS	[RDX+48], XMM3	;
+		ADD	RDX, 64	;
+		SUB	RAX, 16	;
+		JMP	unaligned16	;
+		;  LOOP FOR 4 unaligned pieces
+		unaligned4: ;
+		CMP	RAX, 4	;
+		JL	singlepieces	;  len < 4 -> EXIT TO singlepieces
+		MOVUPS	XMM0, [RBX]	;
+		ADD	RBX, 16	;
+		MOVUPS	XMM1, [RCX]	;
+		ADD	RCX, 16	;
+		MULPS	XMM0, XMM1	;
+		MOVUPS	[RDX], XMM0	;
+		ADD	RDX, 16	;
+		SUB	RAX, 4	;
+		JMP	unaligned4	;
+		;  one piece left OR non-contiguous data
+		single:
+		singlepieces: ;
+		CMP	RAX, 0	;
+		JLE	endL	;  len <= 0- > EXIT
+		MOVSS	XMM0, [RBX]
+		ADD	RBX, [RBP+linc]	;  INC(ladr, linc)
+		MOVSS	XMM1, [RCX]
+		ADD	RCX, [RBP+rinc]	;  INC(radr, rinc)
+		MULSS	XMM0, XMM1	;
+		MOVSS	[RDX], XMM0
+		ADD	RDX, [RBP+dinc]	;  INC(dadr, dinc)
+		DEC	RAX	;  DEC(len)
+		JMP	singlepieces	;
+		endL:
+	END EMulARARLoopSSE;
 
 	PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
 	CODE {SYSTEM.AMD64, SYSTEM.FPU}
@@ -6775,6 +7113,8 @@ VAR
 		ArrayBase.loopAddARAR := AddARARLoopA;
 		ArrayBase.loopSubAXAX := SubAXAXLoopA;
 		ArrayBase.loopSubARAR := SubARARLoopA;
+		ArrayBase.loopEMulAXAX := EMulAXAXLoopA;
+		ArrayBase.loopEMulARAR := EMulARARLoopA;
 		ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
 		ArrayBase.loopMatMulARAR := MatMulARARLoopA;
 		ArrayBase.loopMulAXSX := MulAXSXLoopA;
@@ -6794,6 +7134,7 @@ VAR
 			ArrayBase.loopSPARAR := SPARARLoopSSE;
 			ArrayBase.loopAddARAR := AddARARLoopSSE;
 			ArrayBase.loopSubARAR := SubARARLoopSSE;
+			ArrayBase.loopEMulARAR := EMulARARLoopSSE;
 			ArrayBase.loopMulARSR := MulARSRLoopSSE;
 			ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
 			ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
@@ -6818,6 +7159,7 @@ VAR
 			ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
 			ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
 			ArrayBase.loopSubAXAX := SubAXAXLoopSSE;
+			ArrayBase.loopEMulAXAX := EMulAXAXLoopSSE;
 			ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
 			ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
 			ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;
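
For reference, SetDefaults in FoxArrayBase.Mod installs portable fallbacks (EMulAXAXLoop, EMulARARLoop, ...) for these entries; their bodies are not part of this diff, so the following is only a sketch of the scalar semantics that the assembly loops above implement (assuming SYSTEM is imported):

	(* sketch only: equivalent scalar semantics of EMulARARLoopA/EMulARARLoopSSE *)
	PROCEDURE EMulARARLoop( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
	VAR l, r: REAL;
	BEGIN
		WHILE len > 0 DO
			SYSTEM.GET( ladr, l );  SYSTEM.GET( radr, r );	(* load left and right elements *)
			SYSTEM.PUT( dadr, l * r );	(* store the elementwise product *)
			INC( ladr, linc );  INC( radr, rinc );  INC( dadr, dinc );	(* advance by the strides *)
			DEC( len )
		END
	END EMulARARLoop;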

+ 9 - 7
source/FoxArrayBase.Mod

@@ -155,6 +155,7 @@ VAR
 	loopSPAZAZ, loopSPALZALZ: BinaryAASLoop;
 	loopAddAXAX*, loopAddARAR*, loopAddAZAZ*, loopAddALZALZ*: BinaryAAALoop;
 	loopSubAXAX*, loopSubARAR*, loopSubAZAZ*, loopSubALZALZ*: BinaryAAALoop;
+	loopEMulAXAX*, loopEMulARAR*, loopEMulAZAZ*, loopEMulALZALZ*: BinaryAAALoop;
 	loopMatMulAXAX*, loopMatMulARAR*: BinaryAASLoop;
 	loopMatMulIncAXAX*, loopMatMulIncARAR*: BinaryAASLoop;
 	loopMulAXSX*, loopMulARSR*, loopMulAZSZ*, loopMulALZSLZ*: BinaryASALoop;
@@ -186,10 +187,11 @@ VAR
 	PROCEDURE SetDefaults*;   (* set standard procedures *)
 	BEGIN
 		KernelLog.String( "ArrayBase XXXXXXX: setting runtime library (semi-optimized) default methods." );  KernelLog.Ln;  loopSPAXAX := SPAXAXLoop;
-		loopSPARAR := SPARARLoop;  loopAddAXAX := AddAXAXLoop;  loopSubAXAX := SubAXAXLoop;
+		loopSPARAR := SPARARLoop;  loopAddAXAX := AddAXAXLoop;  loopSubAXAX := SubAXAXLoop; loopEMulAXAX := EMulAXAXLoop;
 		loopSPAZAZ := SPAZAZLoop; loopSPALZALZ := SPALZALZLoop;
-		loopAddARAR := AddARARLoop; loopSubARAR := SubARARLoop;  loopMatMulAXAX := MatMulAXAXLoop;
-		loopAddAZAZ := AddAZAZLoop; loopAddALZALZ := AddALZALZLoop; loopSubAZAZ := SubAZAZLoop; loopSubALZALZ := SubALZALZLoop;
+		loopAddARAR := AddARARLoop; loopSubARAR := SubARARLoop; loopEMulARAR := EMulARARLoop; loopMatMulAXAX := MatMulAXAXLoop;
+		loopAddAZAZ := AddAZAZLoop; loopAddALZALZ := AddALZALZLoop; loopSubAZAZ := SubAZAZLoop; loopSubALZALZ := SubALZALZLoop; 
+		loopEMulAZAZ := EMulAZAZLoop; loopEMulALZALZ := EMulALZALZLoop;
 		loopMatMulIncAXAX := MatMulIncAXAXLoop;
 		loopMatMulARAR := MatMulARARLoop;  loopMulAXSX := MulAXSXLoop;
 		loopIncMulAXSX := IncMulAXSXLoop;
@@ -3005,7 +3007,7 @@ Sufficient (but not necessary) conditions:
 	OPERATOR ".*"*(CONST left,right: ARRAY [?] OF REAL): ARRAY {UNSAFE} [?] OF REAL;
 	BEGIN
 		ApplyBinaryAAAOp( RESULT, left, right, SIZEOF( REAL ),
-										  EMulARARLoop );
+										  loopEMulARAR );
 		RETURN RESULT
 	END ".*";
 
@@ -3022,7 +3024,7 @@ Sufficient (but not necessary) conditions:
 	OPERATOR ".*"*(CONST left,right: ARRAY [?] OF LONGREAL): ARRAY {UNSAFE} [?] OF LONGREAL;
 	BEGIN
 		ApplyBinaryAAAOp( RESULT, left, right,
-										  SIZEOF( LONGREAL ), EMulAXAXLoop );
+										  SIZEOF( LONGREAL ), loopEMulAXAX );
 		RETURN RESULT
 	END ".*";
 
@@ -3040,7 +3042,7 @@ Sufficient (but not necessary) conditions:
 	OPERATOR ".*"*(CONST left,right: ARRAY [?] OF COMPLEX): ARRAY {UNSAFE} [?] OF COMPLEX;
 	BEGIN
 		ApplyBinaryAAAOp( RESULT, left, right,
-										  SIZEOF( COMPLEX ), EMulAZAZLoop );
+										  SIZEOF( COMPLEX ), loopEMulAZAZ );
 		RETURN RESULT
 	END ".*";
 
@@ -3060,7 +3062,7 @@ Sufficient (but not necessary) conditions:
 	OPERATOR ".*"*(CONST left,right: ARRAY [?] OF LONGCOMPLEX): ARRAY {UNSAFE} [?] OF LONGCOMPLEX;
 	BEGIN
 		ApplyBinaryAAAOp( RESULT, left, right,
-										  SIZEOF( LONGCOMPLEX ), EMulALZALZLoop );
+										  SIZEOF( LONGCOMPLEX ), loopEMulALZALZ );
 		RETURN RESULT
 	END ".*";
 

+ 342 - 0
source/I386.FoxArrayBaseOptimized.Mod

@@ -1798,6 +1798,344 @@ VAR
 		JMP	singlepieces	;
 		endL:
 	END SubARARLoopSSE;
+	
+	(* loops for elementwise array multiplication (dest[i] := left[i] * right[i]) *)
+	PROCEDURE EMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.i386, SYSTEM.FPU}
+		MOV	EAX, [EBP+len]	;
+		MOV	EBX, [EBP+ladr]	;
+		MOV	ECX, [EBP+radr]	;
+		MOV	EDX, [EBP+dadr]	;
+		start:
+		CMP	EAX, 0	;
+		JLE	endL	;
+		FLD	QWORD [EBX]	;
+		ADD	EBX, [EBP+linc]	;
+		FLD	QWORD [ECX]	;
+		ADD	ECX, [EBP+rinc]	;
+		FMULP	;
+		FSTP	QWORD [EDX]	;
+		ADD	EDX, [EBP+dinc]	;
+		DEC	EAX	;
+		JMP	start	;
+		endL:
+		FWAIT	;
+	END EMulAXAXLoopA;
+
+	PROCEDURE EMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.i386, SYSTEM.FPU}
+		MOV	EAX, [EBP+len]	;
+		MOV	EBX, [EBP+ladr]	;
+		MOV	ECX, [EBP+radr]	;
+		MOV	EDX, [EBP+dadr]	;
+		start:
+		CMP	EAX, 0	;
+		JLE	endL	;
+		FLD	DWORD [EBX]	;
+		ADD	EBX, [EBP+linc]	;
+		FLD	DWORD [ECX]	;
+		ADD	ECX, [EBP+rinc]	;
+		FMULP	;
+		FSTP	DWORD [EDX]	;
+		ADD	EDX, [EBP+dinc]	;
+		DEC	EAX	;
+		JMP	start	;
+		endL:
+		FWAIT	;
+	END EMulARARLoopA;
+
+	PROCEDURE EMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.i386, SYSTEM.SSE2}
+		MOV	EAX, [EBP+len]	;
+		CMP	EAX, 0	;
+		JLE	endL	;  nothing TO be done, EAX > 0 guaranteed from here on
+		MOV	EBX, [EBP+ladr]	;
+		MOV	ECX, [EBP+radr]	;
+		MOV	EDX, [EBP+dadr]	;
+		;  check IF data are contiguous IN memory
+		CMP	[EBP+linc], 8	;  check left FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		CMP	[EBP+rinc], 8	;  check right FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		CMP	[EBP+dinc], 8	;  check destination FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		;  check FOR alignment
+		MOV	ESI, EBX	;
+		AND	ESI, 7	;  ladr MOD 8
+		CMP	ESI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	ESI, ECX	;
+		AND	ESI, 7	;  radr MOD 8
+		CMP	ESI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	ESI, EDX	;
+		AND	ESI, 7	;  dadr MOD 8
+		CMP	ESI, 0	;  = 0- > 64 Bit alignment
+		JNE	unaligned	;  not 64 bit aligned
+		MOV	ESI, EBX	;
+		AND	ESI, 8	;  16 byte alignment
+		MOV	EDI, ECX	;
+		AND	EDI, 8	;  16 byte alignment
+		CMP	ESI, EDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and radr
+		MOV	EDI, EDX	;
+		AND	EDI, 8	;  16 byte alignment
+		CMP	ESI, EDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and dadr
+		CMP	ESI, 8	;
+		JNE	aligned	;  ladr, radr and dadr already 128 bit aligned
+		;  process one single element TO achieve 128 bit alignment
+		MOVSD	XMM1, [EBX]	;
+		MOVSD	XMM0, [ECX]	;
+		MULSD	XMM0, XMM1	;
+		MOVSD	[EDX], XMM0	;
+		ADD	EBX, 8	;  now EBX IS 16 byte aligned
+		ADD	ECX, 8	;  now ECX IS 16 byte aligned	;
+		ADD	EDX, 8	;  now EDX IS 16 byte aligned	;
+		DEC	EAX	;  one element has been processed
+		aligned:
+		aligned8:
+		CMP	EAX, 8	;
+		JL	aligned2	;  len < 8 -> EXIT TO aligned2
+		MOVAPD	XMM0, [EBX]	;
+		MOVAPD	XMM1, [EBX+16]	;
+		MOVAPD	XMM2, [EBX+32]	;
+		MOVAPD	XMM3, [EBX+48]	;
+		ADD	EBX, 64	;
+		MOVAPD	XMM4, [ECX]	;
+		MOVAPD	XMM5, [ECX+16]	;
+		MOVAPD	XMM6, [ECX+32]	;
+		MOVAPD	XMM7, [ECX+48]	;
+		ADD	ECX, 64	;
+		MULPD	XMM0, XMM4	;
+		MULPD	XMM1, XMM5	;
+		MULPD	XMM2, XMM6	;
+		MULPD	XMM3, XMM7	;
+		MOVAPD	[EDX], XMM0	;
+		MOVAPD	[EDX+16], XMM1	;
+		MOVAPD	[EDX+32], XMM2	;
+		MOVAPD	[EDX+48], XMM3	;
+		ADD	EDX, 64	;
+		SUB	EAX, 8	;
+		JMP	aligned8	;
+		;  LOOP FOR 2 pieces aligned
+		aligned2: ;
+		CMP	EAX, 2	;
+		JL	singlepieces	;  len < 2- > EXIT TO singlepieces
+		MOVAPD	XMM0, [EBX]	;
+		ADD	EBX, 16	;
+		MOVAPD	XMM1, [ECX]	;
+		ADD	ECX, 16	;
+		MULPD	XMM0, XMM1	;
+		MOVAPD	[EDX], XMM0	;
+		ADD	EDX, 16	;
+		SUB	EAX, 2	;
+		JMP	aligned2	;
+		;  LOOP FOR 8 unaligned pieces(14 pieces not better!)
+		unaligned: ;
+		unaligned8: ;
+		CMP	EAX, 8	;
+		JL	unaligned2	;  len < 8 -> EXIT TO unaligned2
+		MOVUPD	XMM0, [EBX]	;
+		MOVUPD	XMM1, [EBX+16]	;
+		MOVUPD	XMM2, [EBX+32]	;
+		MOVUPD	XMM3, [EBX+48]	;
+		ADD	EBX, 64	;
+		MOVUPD	XMM4, [ECX]	;
+		MOVUPD	XMM5, [ECX+16]	;
+		MOVUPD	XMM6, [ECX+32]	;
+		MOVUPD	XMM7, [ECX+48]	;
+		ADD	ECX, 64	;
+		MULPD	XMM0, XMM4	;
+		MULPD	XMM1, XMM5	;
+		MULPD	XMM2, XMM6	;
+		MULPD	XMM3, XMM7	;
+		MOVUPD	[EDX], XMM0	;
+		MOVUPD	[EDX+16], XMM1	;
+		MOVUPD	[EDX+32], XMM2	;
+		MOVUPD	[EDX+48], XMM3	;
+		ADD	EDX, 64	;
+		SUB	EAX, 8	;
+		JMP	unaligned8	;
+		;  LOOP FOR 2 unaligned pieces
+		unaligned2: ;
+		CMP	EAX, 2	;
+		JL	singlepieces	;  len < 2- > EXIT TO singlepieces
+		MOVUPD	XMM0, [EBX]	;
+		ADD	EBX, 16	;
+		MOVUPD	XMM1, [ECX]	;
+		ADD	ECX, 16	;
+		MULPD	XMM0, XMM1	;
+		MOVUPD	[EDX], XMM0	;
+		ADD	EDX, 16	;
+		SUB	EAX, 2	;
+		JMP	unaligned2	;
+		;  one piece left OR non-contiguous data
+		single:
+		singlepieces: ;
+		CMP	EAX, 0	;
+		JLE	endL	;  len <= 0- > EXIT
+		MOVSD	XMM0, [EBX]
+		ADD	EBX, [EBP+linc]	;  INC(ladr, linc)
+		MOVSD	XMM1, [ECX]
+		ADD	ECX, [EBP+rinc]	;  INC(radr, rinc)
+		MULSD	XMM0, XMM1	;
+		MOVSD	[EDX], XMM0
+		ADD	EDX, [EBP+dinc]	;  INC(dadr, dinc)
+		DEC	EAX	;  DEC(len)
+		JMP	singlepieces	;
+		endL:
+	END EMulAXAXLoopSSE;
+
+	PROCEDURE EMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
+	CODE {SYSTEM.i386, SYSTEM.SSE2}
+		MOV	EAX, [EBP+len]	;
+		CMP	EAX, 0	;
+		JLE	endL	;  nothing TO be done, EAX > 0 guaranteed from here on
+		MOV	EBX, [EBP+ladr]	;
+		MOV	ECX, [EBP+radr]	;
+		MOV	EDX, [EBP+dadr]	;
+		;  check IF data are contiguous IN memory
+		CMP	[EBP+linc], 4	;  check left FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		CMP	[EBP+rinc], 4	;  check right FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		CMP	[EBP+dinc], 4	;  check destination FOR continuity
+		JNE	single	;  not contiguous -> simplest method
+		;  check FOR alignment
+		MOV	ESI, EBX	;
+		AND	ESI, 3	;  ladr MOD 4
+		CMP	ESI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	ESI, ECX	;
+		AND	ESI, 3	;  radr MOD 4
+		CMP	ESI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	ESI, EDX	;
+		AND	ESI, 3	;  dadr MOD 4
+		CMP	ESI, 0	;  = 0- > 32 Bit alignment
+		JNE	unaligned	;  not 32 bit aligned
+		MOV	ESI, EBX	;
+		AND	ESI, 8+4	;  16 byte alignment?
+		MOV	EDI, ECX	;
+		AND	EDI, 8+4	;  16 byte alignment?
+		CMP	ESI, EDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and radr
+		MOV	EDI, EDX	;
+		AND	EDI, 8+4	;  16 byte alignment
+		CMP	ESI, EDI	;
+		JNE	unaligned	;  different 16 byte = 128 bit alignment OF ladr and dadr
+		CMP	ESI, 0	;
+		JE	aligned	;  already aligned
+		align:
+		;  process single elements UNTIL 128 bit alignment is achieved
+		MOVSS	XMM1, [EBX]	;
+		MOVSS	XMM0, [ECX]	;
+		MULSS	XMM0, XMM1	;
+		MOVSS	[EDX], XMM0	;
+		ADD	EBX, 4	;
+		ADD	ECX, 4	;
+		ADD	EDX, 4	;
+		DEC	EAX	;  one element has been processed	;
+		CMP	EAX, 0	;  all elements already processed?
+		JLE	single	;
+		MOV	ESI, EBX	;
+		AND	ESI, 8+4	;
+		CMP	ESI, 0	;
+		JNE	align	;
+		aligned:
+		aligned16:
+		CMP	EAX, 16	;
+		JL	aligned4	;  len < 16 -> EXIT TO aligned4
+		MOVAPS	XMM0, [EBX]	;
+		MOVAPS	XMM1, [EBX+16]	;
+		MOVAPS	XMM2, [EBX+32]	;
+		MOVAPS	XMM3, [EBX+48]	;
+		ADD	EBX, 64	;
+		MOVAPS	XMM4, [ECX]	;
+		MOVAPS	XMM5, [ECX+16]	;
+		MOVAPS	XMM6, [ECX+32]	;
+		MOVAPS	XMM7, [ECX+48]	;
+		ADD	ECX, 64	;
+		MULPS	XMM0, XMM4	;
+		MULPS	XMM1, XMM5	;
+		MULPS	XMM2, XMM6	;
+		MULPS	XMM3, XMM7	;
+		MOVAPS	[EDX], XMM0	;
+		MOVAPS	[EDX+16], XMM1	;
+		MOVAPS	[EDX+32], XMM2	;
+		MOVAPS	[EDX+48], XMM3	;
+		ADD	EDX, 64	;
+		SUB	EAX, 16	;
+		JMP	aligned16	;
+		;  LOOP FOR 4 pieces aligned
+		aligned4: ;
+		CMP	EAX, 4	;
+		JL	singlepieces	;  len < 4 -> EXIT TO singlepieces
+		MOVAPS	XMM0, [EBX]	;
+		ADD	EBX, 16	;
+		MOVAPS	XMM1, [ECX]	;
+		ADD	ECX, 16	;
+		MULPS	XMM0, XMM1	;
+		MOVAPS	[EDX], XMM0	;
+		ADD	EDX, 16	;
+		SUB	EAX, 4	;
+		JMP	aligned4	;
+		;  LOOP FOR 16 unaligned pieces
+		unaligned: ;
+		unaligned16: ;
+		CMP	EAX, 16	;
+		JL	unaligned4	;  len < 16 -> EXIT TO unaligned4
+		MOVUPS	XMM0, [EBX]	;
+		MOVUPS	XMM1, [EBX+16]	;
+		MOVUPS	XMM2, [EBX+32]	;
+		MOVUPS	XMM3, [EBX+48]	;
+		ADD	EBX, 64	;
+		MOVUPS	XMM4, [ECX]	;
+		MOVUPS	XMM5, [ECX+16]	;
+		MOVUPS	XMM6, [ECX+32]	;
+		MOVUPS	XMM7, [ECX+48]	;
+		ADD	ECX, 64	;
+		MULPS	XMM0, XMM4	;
+		MULPS	XMM1, XMM5	;
+		MULPS	XMM2, XMM6	;
+		MULPS	XMM3, XMM7	;
+		MOVUPS	[EDX], XMM0	;
+		MOVUPS	[EDX+16], XMM1	;
+		MOVUPS	[EDX+32], XMM2	;
+		MOVUPS	[EDX+48], XMM3	;
+		ADD	EDX, 64	;
+		SUB	EAX, 16	;
+		JMP	unaligned16	;
+		;  LOOP FOR 4 unaligned pieces
+		unaligned4: ;
+		CMP	EAX, 4	;
+		JL	singlepieces	;  len < 4 -> EXIT TO singlepieces
+		MOVUPS	XMM0, [EBX]	;
+		ADD	EBX, 16	;
+		MOVUPS	XMM1, [ECX]	;
+		ADD	ECX, 16	;
+		MULPS	XMM0, XMM1	;
+		MOVUPS	[EDX], XMM0	;
+		ADD	EDX, 16	;
+		SUB	EAX, 4	;
+		JMP	unaligned4	;
+		;  one piece left OR non-contiguous data
+		single:
+		singlepieces: ;
+		CMP	EAX, 0	;
+		JLE	endL	;  len <= 0- > EXIT
+		MOVSS	XMM0, [EBX]
+		ADD	EBX, [EBP+linc]	;  INC(ladr, linc)
+		MOVSS	XMM1, [ECX]
+		ADD	ECX, [EBP+rinc]	;  INC(radr, rinc)
+		MULSS	XMM0, XMM1	;
+		MOVSS	[EDX], XMM0
+		ADD	EDX, [EBP+dinc]	;  INC(dadr, dinc)
+		DEC	EAX	;  DEC(len)
+		JMP	singlepieces	;
+		endL:
+	END EMulARARLoopSSE;
 
 	PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
 	CODE {SYSTEM.i386, SYSTEM.FPU}
@@ -6777,6 +7115,8 @@ VAR
 		ArrayBase.loopAddARAR := AddARARLoopA;
 		ArrayBase.loopSubAXAX := SubAXAXLoopA;
 		ArrayBase.loopSubARAR := SubARARLoopA;
+		ArrayBase.loopEMulAXAX := EMulAXAXLoopA;
+		ArrayBase.loopEMulARAR := EMulARARLoopA;
 		ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
 		ArrayBase.loopMatMulARAR := MatMulARARLoopA;
 		ArrayBase.loopMulAXSX := MulAXSXLoopA;
@@ -6796,6 +7136,7 @@ VAR
 			ArrayBase.loopSPARAR := SPARARLoopSSE;
 			ArrayBase.loopAddARAR := AddARARLoopSSE;
 			ArrayBase.loopSubARAR := SubARARLoopSSE;
+			ArrayBase.loopEMulARAR := EMulARARLoopSSE;
 			ArrayBase.loopMulARSR := MulARSRLoopSSE;
 			ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
 			ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
@@ -6820,6 +7161,7 @@ VAR
 			ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
 			ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
 			ArrayBase.loopSubAXAX := SubAXAXLoopSSE;
+			ArrayBase.loopEMulAXAX := EMulAXAXLoopSSE;
 			ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
 			ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
 			ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;