ArrayBaseOptimized.SymW 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. MODULE ArrayBaseOptimized;
  2. IMPORT SYSTEM, ArrayBase, Machine, KernelLog, Commands;
  (* Module-level constants. A `*` directly after a name is the export mark; *)
  (* constants without it are private to the module. *)
  (* L2CacheSize evaluates to 524288; the unit is not stated here - presumably bytes, TODO confirm. *)
  (* NOTE(review): the cMatMul... values look like method selectors, presumably the *)
  (* argument accepted by SetMatMulMethod - confirm against the implementation. *)
  3. CONST
  4. L2CacheSize = (512 * 1024);
  5. L1BlockN = 5;
  6. L2BARatio = 1;
  7. L0BlockKR = 4;
  8. L1MaxBlockKR = 336;
  9. L2BlockSize = 81920;
  10. L0BlockKX = 2;
  11. L1MaxBlockKX = 256;
  12. debug = FALSE;
  13. parallel = TRUE;
  14. SSE = TRUE;
  15. MaxCachePoolSize = 0;
  16. maxProcesses = 32;
  17. cMatMulDynamic* = -1;
  18. cMatMulScalarProduct* = 0;
  19. cMatMulNaive* = 1;
  20. cMatMulTransposed* = 2;
  21. cMatMulStride* = 3;
  22. cMatMulBlocked* = 4;
  23. TYPE
  (* Cache: pointer to a record holding an untyped reference (p), two integers *)
  (* (adr, size) and two links of the same pointer type (prev, next). *)
  (* NOTE(review): prev/next suggest a doubly linked list node - confirm in the implementation. *)
  24. Cache = POINTER TO RECORD
  25. p: ANY;
  26. adr, size: LONGINT;
  27. prev, next: Cache;
  28. END;
  (* CachePool: object type holding two Cache references (first, last). *)
  (* Declares a constructor Init (marked with `&`), Acquire(size) returning a Cache, *)
  (* and Release(c) taking a Cache. Only signatures are listed; bodies are not shown. *)
  29. CachePool = OBJECT {EXCLUSIVE}
  30. VAR
  31. first, last: Cache;
  32. PROCEDURE ^ & Init*;
  33. PROCEDURE ^ Acquire(size: LONGINT): Cache;
  34. PROCEDURE ^ Release(c: Cache);
  35. END CachePool;
  (* ComputationObj: base object type extended by MatMulHObjR, MatMulHObjX, *)
  (* MultiplyObjectR and MultiplyObjectX. Holds a BOOLEAN field `done` and declares *)
  (* the constructor Init plus the methods Compute and Wait (signatures only). *)
  (* The object body carries the ACTIVE and EXCLUSIVE modifiers; its statements are not listed here. *)
  (* NOTE(review): presumably the body calls Compute and Wait blocks on `done` - confirm. *)
  36. ComputationObj = OBJECT {EXCLUSIVE}
  37. VAR
  38. done: BOOLEAN;
  39. PROCEDURE ^ & Init*;
  40. PROCEDURE ^ Compute;
  41. PROCEDURE ^ Wait;
  42. BEGIN{ACTIVE, EXCLUSIVE}
  43. END ComputationObj;
  (* MatMulHObjR: extension of ComputationObj. Its fields have the same names and types *)
  (* as the parameters of the constructor InitR, and it declares its own Compute. *)
  (* NOTE(review): the field list matches the parameter list of MatMulHBlockR, so Compute *)
  (* presumably forwards to it - confirm in the implementation. *)
  44. MatMulHObjR = OBJECT {EXCLUSIVE} (ComputationObj)
  45. VAR
  46. MatrixA, MatrixB, MatrixC, Stride, IncC, StrideC, RowsA, RowsB, Cols: LONGINT;
  47. add: BOOLEAN;
  48. PROCEDURE ^ & InitR*(MatrixA, MatrixB, MatrixC, Stride, IncC, StrideC, RowsA, RowsB, Cols: LONGINT; add: BOOLEAN);
  49. PROCEDURE ^ Compute;
  50. END MatMulHObjR;
  (* MatMulHObjX: extension of ComputationObj. Its fields have the same names and types *)
  (* as the parameters of the constructor InitX, and it declares its own Compute. *)
  (* NOTE(review): the field list matches the parameter list of MatMulHBlockX, so Compute *)
  (* presumably forwards to it - confirm in the implementation. *)
  51. MatMulHObjX = OBJECT {EXCLUSIVE} (ComputationObj)
  52. VAR
  53. MatrixA, MatrixB, MatrixC, Stride, IncC, StrideC, RowsA, RowsB, Cols: LONGINT;
  54. add: BOOLEAN;
  55. PROCEDURE ^ & InitX*(MatrixA, MatrixB, MatrixC, Stride, IncC, StrideC, RowsA, RowsB, Cols: LONGINT; add: BOOLEAN);
  56. PROCEDURE ^ Compute;
  57. END MatMulHObjX;
  (* MultiplyObjectR: extension of ComputationObj. The LONGINT fields have the same names *)
  (* as the parameters of the constructor InitR; two further BOOLEAN fields (start, finished) *)
  (* are not constructor parameters. Declares its own Compute (signature only). *)
  (* NOTE(review): the roles of start/finished are not visible in this listing. *)
  58. MultiplyObjectR = OBJECT {EXCLUSIVE} (ComputationObj)
  59. VAR
  60. adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: LONGINT;
  61. start, finished: BOOLEAN;
  62. PROCEDURE ^ & InitR*(adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: LONGINT);
  63. PROCEDURE ^ Compute;
  64. END MultiplyObjectR;
  (* MultiplyObjectX: extension of ComputationObj. The LONGINT fields have the same names *)
  (* as the parameters of the constructor InitX; two further BOOLEAN fields (start, finished) *)
  (* are not constructor parameters. Declares its own Compute (signature only). *)
  (* NOTE(review): the roles of start/finished are not visible in this listing. *)
  65. MultiplyObjectX = OBJECT {EXCLUSIVE} (ComputationObj)
  66. VAR
  67. adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: LONGINT;
  68. start, finished: BOOLEAN;
  69. PROCEDURE ^ & InitX*(adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: LONGINT);
  70. PROCEDURE ^ Compute;
  71. END MultiplyObjectX;
  (* Module-level variables. `*` marks a variable as exported; `-` marks it as exported *)
  (* read-only; cachePool carries no mark and is private to the module. *)
  (* NOTE(review): the names suggest counters, timing accumulators and tuning settings; *)
  (* their exact meaning and units are not visible in this listing. *)
  72. VAR
  73. alignedC*, unalignedC*, singleC*: LONGINT;
  74. rejectMatMul*: LONGINT;
  75. matAllocTime*, matCompTime*: LONGINT;
  76. cBlockSize*: LONGINT;
  77. nrProcesses*: LONGINT;
  78. lastUsedBlockSize*: LONGINT;
  79. allocT-, copyT-, zeroT-, compT-: HUGEINT;
  80. cachePool: CachePool;
  (* Inline procedures (declared with `PROCEDURE -`). Their CODE sections are empty in *)
  (* this listing, so only the signatures are documented. *)
  (* L1Block1XA and L1Block1XSSE take (adrA, adrB, adrC, K); L1Block5XSSE additionally takes IncC. *)
  (* NOTE(review): presumably adrA/adrB/adrC are memory addresses of operand and result blocks *)
  (* and K is an inner length - TODO confirm against the implementation. *)
  81. PROCEDURE - L1Block1XA(adrA, adrB, adrC, K: LONGINT);
  82. CODE
  83. END L1Block1XA;
  84. PROCEDURE - L1Block1XSSE(adrA, adrB, adrC, K: LONGINT);
  85. CODE
  86. END L1Block1XSSE;
  87. PROCEDURE - L1Block5XSSE(adrA, adrB, adrC, IncC, K: LONGINT);
  88. CODE
  89. END L1Block5XSSE;
  (* Inline procedures (declared with `PROCEDURE -`). Their CODE sections are empty in *)
  (* this listing, so only the signatures are documented. *)
  (* L1Block1RA and L1Block1RSSE take (adrA, adrB, adrC, K); L1Block5RSSE additionally takes IncC. *)
  (* NOTE(review): these mirror the L1Block...X procedures with an R in the name; presumably *)
  (* the R/X letter selects the element type - TODO confirm. *)
  90. PROCEDURE - L1Block1RA(adrA, adrB, adrC, K: LONGINT);
  91. CODE
  92. END L1Block1RA;
  93. PROCEDURE - L1Block1RSSE(adrA, adrB, adrC, K: LONGINT);
  94. CODE
  95. END L1Block1RSSE;
  96. PROCEDURE - L1Block5RSSE(adrA, adrB, adrC, IncC, K: LONGINT);
  97. CODE
  98. END L1Block5RSSE;
  (* Inline procedures taking one LONGINT (adr) and returning a LONGINT. *)
  (* The CODE sections are empty in this listing. *)
  (* NOTE(review): presumably they round adr to an alignment boundary; the exact boundary *)
  (* and rounding direction are not visible here - TODO confirm. *)
  99. PROCEDURE - Align4(adr: LONGINT): LONGINT;
  100. CODE
  101. END Align4;
  102. PROCEDURE - Align2(adr: LONGINT): LONGINT;
  103. CODE
  104. END Align2;
  (* Inline procedures without a return value; CODE sections are empty in this listing. *)
  (* ZeroR and ZeroX take (adr, count); ZeroRI and ZeroXI take an extra `inc` parameter. *)
  (* NOTE(review): presumably these clear `count` elements starting at adr, with `inc` as *)
  (* the step between elements in the ...I variants - TODO confirm. *)
  105. PROCEDURE - ZeroR(adr: LONGINT; count: LONGINT);
  106. CODE
  107. END ZeroR;
  108. PROCEDURE - ZeroX(adr: LONGINT; count: LONGINT);
  109. CODE
  110. END ZeroX;
  111. PROCEDURE - ZeroRI(adr, inc: LONGINT; count: LONGINT);
  112. CODE
  113. END ZeroRI;
  114. PROCEDURE - ZeroXI(adr, inc: LONGINT; count: LONGINT);
  115. CODE
  116. END ZeroXI;
  (* Inline procedures without a return value; CODE sections are empty in this listing. *)
  (* MovR and MovX take (from, to0, frominc, count); MovR5 takes (src, inc, stride, dest, count). *)
  (* NOTE(review): presumably these copy `count` elements from a strided source to a *)
  (* destination - TODO confirm against the implementation. *)
  117. PROCEDURE - MovR(from, to0, frominc, count: LONGINT);
  118. CODE
  119. END MovR;
  120. PROCEDURE - MovX(from, to0, frominc, count: LONGINT);
  121. CODE
  122. END MovX;
  123. PROCEDURE - MovR5(src, inc, stride, dest, count: LONGINT);
  124. CODE
  125. END MovR5;
  (* Forward declarations (`PROCEDURE ^`): signatures only, bodies are not part of this listing. *)
  (* Add... loops take (ladr, radr, dadr, linc, rinc, dinc, len); SP... loops take *)
  (* (ladr, radr, dadr, linc, rinc, len); Mul... and IncMul... loops take *)
  (* (ladr, radr, dadr, linc, dinc, len); AlignedSP... take (ladr, radr, dadr, len) plus a BOOLEAN add. *)
  (* Each signature appears in an ...A and an ...SSE variant and in an X and an R variant. *)
  (* NOTE(review): presumably ladr/radr/dadr are left/right/destination addresses and *)
  (* linc/rinc/dinc the matching increments - TODO confirm. *)
  126. PROCEDURE ^ AddAXAXLoopA(ladr, radr, dadr, linc, rinc, dinc, len: LONGINT);
  127. PROCEDURE ^ AddARARLoopA(ladr, radr, dadr, linc, rinc, dinc, len: LONGINT);
  128. PROCEDURE ^ AddAXAXLoopSSE(ladr, radr, dadr, linc, rinc, dinc, len: LONGINT);
  129. PROCEDURE ^ AddARARLoopSSE(ladr, radr, dadr, linc, rinc, dinc, len: LONGINT);
  130. PROCEDURE ^ SPAXAXLoopA(ladr, radr, dadr, linc, rinc, len: LONGINT);
  131. PROCEDURE ^ SPARARLoopA(ladr, radr, dadr, linc, rinc, len: LONGINT);
  132. PROCEDURE ^ SPAXAXLoopSSE(ladr, radr, dadr, linc, rinc, len: LONGINT);
  133. PROCEDURE ^ SPARARLoopSSE(ladr, radr, dadr, linc, rinc, len: LONGINT);
  134. PROCEDURE ^ MulAXSXLoopA(ladr, radr, dadr, linc, dinc, len: LONGINT);
  135. PROCEDURE ^ MulARSRLoopA(ladr, radr, dadr, linc, dinc, len: LONGINT);
  136. PROCEDURE ^ IncMulAXSXLoopA(ladr, radr, dadr, linc, dinc, len: LONGINT);
  137. PROCEDURE ^ IncMulARSRLoopA(ladr, radr, dadr, linc, dinc, len: LONGINT);
  138. PROCEDURE ^ MulAXSXLoopSSE(ladr, radr, dadr, linc, dinc, len: LONGINT);
  139. PROCEDURE ^ MulARSRLoopSSE(ladr, radr, dadr, linc, dinc, len: LONGINT);
  140. PROCEDURE ^ IncMulAXSXLoopSSE(ladr, radr, dadr, linc, dinc, len: LONGINT);
  141. PROCEDURE ^ IncMulARSRLoopSSE(ladr, radr, dadr, linc, dinc, len: LONGINT);
  142. PROCEDURE ^ AlignedSPXSSE(ladr, radr, dadr, len: LONGINT; add: BOOLEAN);
  143. PROCEDURE ^ AlignedSPRSSE(ladr, radr, dadr, len: LONGINT; add: BOOLEAN);
  (* Forward declarations (`PROCEDURE ^`): signatures only, bodies are not part of this listing. *)
  (* Copy4 and Copy8 take (ladr, dadr, linc, dinc, len); the Transpose... procedures take *)
  (* (ladr, dadr, lstride, linc, dstride, dinc, rows, cols). *)
  (* NOTE(review): the 4/8 suffix presumably is the element size in bytes - TODO confirm. *)
  144. PROCEDURE ^ Copy4(ladr, dadr, linc, dinc, len: LONGINT);
  145. PROCEDURE ^ Copy8(ladr, dadr, linc, dinc, len: LONGINT);
  146. PROCEDURE ^ Transpose4A(ladr, dadr, lstride, linc, dstride, dinc, rows, cols: LONGINT);
  147. PROCEDURE ^ Transpose4(ladr, dadr, lstride, linc, dstride, dinc, rows, cols: LONGINT);
  148. PROCEDURE ^ Transpose8(ladr, dadr, lstride, linc, dstride, dinc, rows, cols: LONGINT);
  149. PROCEDURE ^ Transpose8A(ladr, dadr, lstride, linc, dstride, dinc, rows, cols: LONGINT);
  (* Forward declarations (`PROCEDURE ^`): signatures only, bodies are not part of this listing. *)
  (* SSEMul24BlockR and SSEMul12BlockX take a VAR parameter CbFirst (so they can write it back *)
  (* to the caller) plus Cb and Rb; the remaining procedures take CbFrom by value instead. *)
  (* All of them end with a BOOLEAN `add` parameter. *)
  (* NOTE(review): the number in each name presumably is the block width handled per call - TODO confirm. *)
  150. PROCEDURE ^ SSEMul24BlockR(VAR CbFirst: LONGINT; StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb, matrixA, matrixB, matrixC: LONGINT; add: BOOLEAN);
  151. PROCEDURE ^ SSEMul12BlockX(VAR CbFirst: LONGINT; StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb, matrixA, matrixB, matrixC: LONGINT; add: BOOLEAN);
  152. PROCEDURE ^ SSEMul16BlockR(StrideA, StrideB, StrideC, Ca, Ra, CbFrom, matrixA, matrixB, matrixC: LONGINT; add: BOOLEAN);
  153. PROCEDURE ^ SSEMul8BlockX(StrideA, StrideB, StrideC, Ca, Ra, CbFrom, matrixA, matrixB, matrixC: LONGINT; add: BOOLEAN);
  154. PROCEDURE ^ SSEMul8BlockR(StrideA, StrideB, StrideC, Ca, Ra, CbFrom, matrixA, matrixB, matrixC: LONGINT; add: BOOLEAN);
  155. PROCEDURE ^ SSEMul4BlockX(StrideA, StrideB, StrideC, Ca, Ra, CbFrom, matrixA, matrixB, matrixC: LONGINT; add: BOOLEAN);
  156. PROCEDURE ^ SSEMul4BlockR(StrideA, StrideB, StrideC, Ca, Ra, CbFrom, matrixA, matrixB, matrixC: LONGINT; add: BOOLEAN);
  157. PROCEDURE ^ SSEMul2BlockX(StrideA, StrideB, StrideC, Ca, Ra, CbFrom, matrixA, matrixB, matrixC: LONGINT; add: BOOLEAN);
  (* Forward declarations (`PROCEDURE ^`): signatures only, bodies are not part of this listing. *)
  (* MagicBlockR/X take M, N, K by value and return results through the VAR parameters *)
  (* L2BlockM, L2BlockN and L2BlockK. L3BlockX/R accept those three block sizes as inputs. *)
  (* Align takes an address and an alignment and returns a LONGINT. *)
  (* NOTE(review): presumably MagicBlock... picks blocking sizes for a given problem size and *)
  (* CopyA.../CopyB... pack operands into `dest` - TODO confirm against the implementation. *)
  158. PROCEDURE ^ MagicBlockR(M, N, K: LONGINT; VAR L2BlockM, L2BlockN, L2BlockK: LONGINT);
  159. PROCEDURE ^ MagicBlockX(M, N, K: LONGINT; VAR L2BlockM, L2BlockN, L2BlockK: LONGINT);
  160. PROCEDURE ^ DispCR(adrM: LONGINT; inc, stride, M, N: LONGINT);
  161. PROCEDURE ^ DispCX(adrM: LONGINT; inc, stride, M, N: LONGINT);
  162. PROCEDURE ^ L3BlockX(matrixA, matrixB, matrixC: LONGINT; M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: LONGINT);
  163. PROCEDURE ^ L3BlockR(matrixA, matrixB, matrixC: LONGINT; M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: LONGINT);
  164. PROCEDURE ^ Align(adr: LONGINT; align: LONGINT): LONGINT;
  165. PROCEDURE ^ CopyAX(matrixA, dest: LONGINT; IncA, StrideA: LONGINT; K, M, L2BlockK, L2BlockM: LONGINT);
  166. PROCEDURE ^ CopyAR(matrixA, dest: LONGINT; IncA, StrideA: LONGINT; K, M, L2BlockK, L2BlockM: LONGINT);
  167. PROCEDURE ^ CopyBX(matrixB, dest: LONGINT; IncB, StrideB: LONGINT; N, K, L2BlockN, L2BlockK: LONGINT);
  168. PROCEDURE ^ CopyBR(matrixB, dest: LONGINT; IncB, StrideB: LONGINT; N, K, L2BlockN, L2BlockK: LONGINT);
  (* Inline procedure (declared with `PROCEDURE -`) taking no parameters and returning a HUGEINT. *)
  (* The CODE section is empty in this listing. *)
  (* NOTE(review): presumably reads a hardware timestamp counter; the unit is not visible here - TODO confirm. *)
  169. PROCEDURE - GetTimer(): HUGEINT;
  170. CODE
  171. END GetTimer;
  (* Forward declarations (`PROCEDURE ^`): signatures only. Both procedures take their *)
  (* HUGEINT arguments as VAR parameters, so they can update the caller's variables. *)
  (* NOTE(review): presumably Tic stores a start time in t and Toc adds the elapsed time to addto - confirm. *)
  172. PROCEDURE ^ Tic(VAR t: HUGEINT);
  173. PROCEDURE ^ Toc(VAR t, addto: HUGEINT);
  (* Forward declarations (`PROCEDURE ^`): signatures only. Both procedures take three operands *)
  (* (A, B, C), sizes (M, N, K), three L2 block sizes, an Inc/Stride pair for each of A, B and C, *)
  (* and a BOOLEAN `add`. *)
  (* NOTE(review): `add` presumably selects accumulate-into-C versus overwrite - TODO confirm. *)
  174. PROCEDURE ^ MultiplyX(A, B, C, M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; IncA, StrideA, IncB, StrideB, IncC, StrideC: LONGINT; add: BOOLEAN);
  175. PROCEDURE ^ MultiplyR(A, B, C, M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; IncA, StrideA, IncB, StrideB, IncC, StrideC: LONGINT; add: BOOLEAN);
  (* Forward declarations (`PROCEDURE ^`): signatures only. All eight procedures share the *)
  (* parameter list (ladr, radr, dadr, linc, rinc, len) and come in ...A / ...SSE and AXAX / ARAR pairs, *)
  (* each with a MatMul... and a MatMulInc... form. *)
  (* NOTE(review): the Inc forms presumably add to the destination instead of overwriting it - confirm. *)
  176. PROCEDURE ^ MatMulAXAXLoopA(ladr, radr, dadr, linc, rinc, len: LONGINT);
  177. PROCEDURE ^ MatMulAXAXLoopSSE(ladr, radr, dadr, linc, rinc, len: LONGINT);
  178. PROCEDURE ^ MatMulARARLoopA(ladr, radr, dadr, linc, rinc, len: LONGINT);
  179. PROCEDURE ^ MatMulARARLoopSSE(ladr, radr, dadr, linc, rinc, len: LONGINT);
  180. PROCEDURE ^ MatMulIncAXAXLoopA(ladr, radr, dadr, linc, rinc, len: LONGINT);
  181. PROCEDURE ^ MatMulIncAXAXLoopSSE(ladr, radr, dadr, linc, rinc, len: LONGINT);
  182. PROCEDURE ^ MatMulIncARARLoopA(ladr, radr, dadr, linc, rinc, len: LONGINT);
  183. PROCEDURE ^ MatMulIncARARLoopSSE(ladr, radr, dadr, linc, rinc, len: LONGINT);
  (* Forward declarations (`PROCEDURE ^`): signatures only. *)
  (* MatMulHBlockR/X have the same parameter lists as the constructors MatMulHObjR.InitR and *)
  (* MatMulHObjX.InitX. CopyDataR/X take source and destination addresses, an inc/stride pair *)
  (* for each side, and a rows/cols extent. *)
  184. PROCEDURE ^ MatMulHBlockR(MatrixA, MatrixB, MatrixC, Stride, IncC, StrideC, RowsA, RowsB, Cols: LONGINT; add: BOOLEAN);
  185. PROCEDURE ^ MatMulHBlockX(MatrixA, MatrixB, MatrixC, Stride, IncC, StrideC, RowsA, RowsB, Cols: LONGINT; add: BOOLEAN);
  186. PROCEDURE ^ CopyDataR(src, dest, incSrc, strideSrc, incDest, strideDest, rows, cols: LONGINT);
  187. PROCEDURE ^ CopyDataX(src, dest, incSrc, strideSrc, incDest, strideDest, rows, cols: LONGINT);
  (* Forward declarations (`PROCEDURE ^`): signatures only. *)
  (* The Transposed and SSEStride procedures take row/column counts for both operands plus *)
  (* `add` and return a BOOLEAN; the Naiive procedures take (M, N, K) plus `add` and return nothing. *)
  (* BestMethod maps (M, N, K) to a LONGINT. *)
  (* NOTE(review): the BOOLEAN result presumably reports whether the method was applicable, and *)
  (* BestMethod presumably returns one of the cMatMul... constants - TODO confirm. *)
  (* NOTE(review): spelled "Naiive" here but "Naive" in cMatMulNaive and MatMulRNaive; the names *)
  (* are part of the interface and are left unchanged. *)
  188. PROCEDURE ^ MatMulARARTransposed(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT; add: BOOLEAN): BOOLEAN;
  189. PROCEDURE ^ MatMulAXAXTransposed(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT; add: BOOLEAN): BOOLEAN;
  190. PROCEDURE ^ MatMulARARSSEStride(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT; add: BOOLEAN): BOOLEAN;
  191. PROCEDURE ^ MatMulAXAXSSEStride(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT; add: BOOLEAN): BOOLEAN;
  192. PROCEDURE ^ MatMulARARNaiive(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT; add: BOOLEAN);
  193. PROCEDURE ^ MatMulAXAXNaiive(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT; add: BOOLEAN);
  194. PROCEDURE ^ BestMethod(M, N, K: LONGINT): LONGINT;
  (* Forward declarations (`PROCEDURE ^`): signatures only. Every procedure in this group has the *)
  (* same 13 LONGINT parameters (three matrices, an Inc/Stride pair per matrix, and row/column *)
  (* counts for A and B) and returns a BOOLEAN. The two ...Blocked procedures with ARAR/AXAX in *)
  (* the name additionally take a BOOLEAN `add`. *)
  (* The remaining names combine R or X, an optional Inc, and a method suffix *)
  (* (none, Naive, Transposed, SSEStride, Blocked). *)
  (* NOTE(review): presumably the uniform signature lets these be installed as interchangeable *)
  (* handlers in ArrayBase, with the Inc forms accumulating into matrixC - TODO confirm. *)
  195. PROCEDURE ^ MatMulR(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  196. PROCEDURE ^ MatMulX(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  197. PROCEDURE ^ MatMulIncR(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  198. PROCEDURE ^ MatMulIncX(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  199. PROCEDURE ^ MatMulARARBlocked(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT; add: BOOLEAN): BOOLEAN;
  200. PROCEDURE ^ MatMulAXAXBlocked(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT; add: BOOLEAN): BOOLEAN;
  201. PROCEDURE ^ MatMulRNaive(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  202. PROCEDURE ^ MatMulXNaive(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  203. PROCEDURE ^ MatMulIncRNaive(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  204. PROCEDURE ^ MatMulIncXNaive(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  205. PROCEDURE ^ MatMulXTransposed(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  206. PROCEDURE ^ MatMulIncXTransposed(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  207. PROCEDURE ^ MatMulRTransposed(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  208. PROCEDURE ^ MatMulIncRTransposed(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  209. PROCEDURE ^ MatMulXSSEStride(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  210. PROCEDURE ^ MatMulIncXSSEStride(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  211. PROCEDURE ^ MatMulRSSEStride(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  212. PROCEDURE ^ MatMulIncRSSEStride(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  213. PROCEDURE ^ MatMulRBlocked(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  214. PROCEDURE ^ MatMulIncRBlocked(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  215. PROCEDURE ^ MatMulXBlocked(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  216. PROCEDURE ^ MatMulIncXBlocked(matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: LONGINT): BOOLEAN;
  (* Forward declarations (`PROCEDURE ^`): signatures only. Procedures whose name is followed by *)
  (* `*` are exported: SetMatMulMethod, InstallMatMul, InstallAsm, InstallSSE, InstallSSE2, *)
  (* Install and SetParameters. InstallMatMul and SetParameters take a Commands.Context, *)
  (* the parameter type used for procedures invoked as commands. *)
  (* MatMulR2x2, MatMulR3x3, MatMulR4x4 and MatVecMulR2x2 are private and take (dadr, ladr, radr). *)
  (* NOTE(review): SetMatMulMethod presumably expects one of the cMatMul... constants, and the *)
  (* Install... procedures presumably register this module's routines with ArrayBase - TODO confirm. *)
  217. PROCEDURE ^ SetMatMulMethod*(i: LONGINT);
  218. PROCEDURE ^ MatMulR2x2(dadr, ladr, radr: LONGINT);
  219. PROCEDURE ^ MatMulR3x3(dadr, ladr, radr: LONGINT);
  220. PROCEDURE ^ MatMulR4x4(dadr, ladr, radr: LONGINT);
  221. PROCEDURE ^ MatVecMulR2x2(dadr, ladr, radr: LONGINT);
  222. PROCEDURE ^ InstallMatMul*(context: Commands.Context);
  223. PROCEDURE ^ InstallAsm*;
  224. PROCEDURE ^ InstallSSE*;
  225. PROCEDURE ^ InstallSSE2*;
  226. PROCEDURE ^ Install*;
  227. PROCEDURE ^ SetParameters*(context: Commands.Context);
  (* Module body. It contains no statements in this listing; any initialisation code of the *)
  (* real module is not shown here. *)
  228. BEGIN
  229. END ArrayBaseOptimized.