AMD64.FoxArrayBaseOptimized.Mod
MODULE FoxArrayBaseOptimized; (** AUTHOR "fof"; PURPOSE ""; **)
IMPORT SYSTEM, ArrayBase := FoxArrayBase, Machine, KernelLog, Commands;
CONST
	L2CacheSize = 512 * 1024; (* L1CacheSize = 16 * 1024; *)
	(* parameters for blocking matrix multiplication *)
	L1BlockN = 5; (* L1 block size -> nr of columns in a block that can be processed using L1 cache *)
	L2BARatio = 1;
	L0BlockKR = 4; (* L0 block size -> nr of elements that can be processed at once for type REAL *)
	L1MaxBlockKR = 336; (* L1CacheSize/SIZEOF(REAL)/2/6 *)
	L2BlockSize = 81920;
	L0BlockKX = 2; (* L0 block size -> nr of elements that can be processed at once for type LONGREAL *)
	L1MaxBlockKX = 256; (* > L1CacheSize/SIZEOF(LONGREAL)/2/6 *)
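	(* Worked example for the two limits above (illustrative, assuming the 16 KB L1 cache size noted in the
	   comment next to L2CacheSize):
	   REAL:     16*1024 / SIZEOF(REAL) / 2 / 6     = 16384/4/2/6 = 341.3..., of which 336 is used (336 is a multiple of 4*L0BlockKR = 16);
	   LONGREAL: 16*1024 / SIZEOF(LONGREAL) / 2 / 6 = 16384/8/2/6 = 170.6..., while L1MaxBlockKX = 256 deliberately exceeds this, as its comment states. *)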
	(*
	DefaultL2CacheSize = 81920;
	L2SizeR = L2CacheSize DIV 8; MaxBlockKR = 336; (* ca L1CacheSize/SIZEOF(REAL)/2/6 *) (* nr of elements that can be processed using L2 cache *)
	L2SizeX = L2CacheSize DIV 8; MaxBlockKX = 256; (* bit more than L1CacheSize/SIZEOF(LONGREAL)/2/6 *) (* nr of elements that can be processed using L2 cache *)
	*)
	debug = FALSE; parallel = TRUE; SSE = TRUE;
	MaxCachePoolSize = 0 (* disabled *) (* 646*1024*1024 *) (* enabled *);
	maxProcesses = 48;
	cMatMulDynamic* = -1; cMatMulScalarProduct* = 0;
	cMatMulNaive* = 1; cMatMulTransposed* = 2;
	cMatMulStride* = 3; cMatMulBlocked* = 4;
VAR
	cBlockSize*: LONGINT; nrProcesses*: LONGINT;
	lastUsedBlockSize*: SIZE;
	allocT-, copyT-, zeroT-, compT-: HUGEINT;
TYPE
	Cache = POINTER TO RECORD
		p: ANY;
		adr: ADDRESS; size: SIZE;
		prev, next: Cache;
	END;

	CachePool = OBJECT
		(*! provide heuristics for overall size *)
		VAR first, last: Cache;

		PROCEDURE & Init*;
		BEGIN
			NEW( first ); first.size := 0; (* sentinel *)
			NEW( last ); last.size := MAX( SIZE ); (* sentinel *)
			first.next := last; first.prev := NIL; last.prev := first; last.next := NIL;
		END Init;

		PROCEDURE Acquire( size: SIZE ): Cache;
		VAR c: Cache; t: HUGEINT;
		BEGIN {EXCLUSIVE}
			IF size = 0 THEN RETURN first END;
			Tic( t );
			c := last;
			WHILE (c.prev.size >= size) DO
				c := c.prev;
			END;
			IF c = last THEN
				NEW( c ); SYSTEM.NEW( c.p, size + 16 );
				c.adr := Align( c.p, 16 );
				c.size := size;
			ELSE
				c.prev.next := c.next;
				c.next.prev := c.prev;
				c.prev := NIL; c.next := NIL;
			END;
			Toc( t, allocT ); RETURN c;
		END Acquire;

		PROCEDURE Release( c: Cache );
		VAR t: Cache;
		BEGIN {EXCLUSIVE}
			IF (c = first) OR (c = NIL) THEN RETURN END;
			ASSERT( c.size > 0 );
			IF c.size > MaxCachePoolSize THEN RETURN END;
			t := first;
			WHILE (t.size <= c.size) DO t := t.next; END;
			c.prev := t.prev; c.next := t; t.prev := c; c.prev.next := c;
		END Release;
	END CachePool;
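	(* Usage sketch (illustrative, not part of the original module): callers obtain a 16-byte aligned
	   scratch buffer from the shared pool and hand it back for reuse once finished, e.g.

		VAR c: Cache;
		...
		c := cachePool.Acquire( rows*cols*SIZEOF( LONGREAL ) );
		(* use the memory at c.adr .. c.adr + c.size - 1 *)
		cachePool.Release( c );

	   Release keeps the free list sorted by size and silently drops buffers larger than MaxCachePoolSize. *)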
	ComputationObj = OBJECT
		VAR done: BOOLEAN;

		PROCEDURE & Init*;
		BEGIN
			done := FALSE;
		END Init;

		PROCEDURE Compute; (*abstract*)
		END Compute;

		PROCEDURE Wait;
		BEGIN {EXCLUSIVE}
			AWAIT( done );
		END Wait;

	BEGIN {ACTIVE, EXCLUSIVE}
		Compute; done := TRUE;
	END ComputationObj;
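	(* Pattern note: ComputationObj is an active object; its {ACTIVE} body runs Compute in its own activity
	   as soon as an instance is created and then sets done, while Wait blocks in AWAIT( done ). A caller can
	   therefore run several computations concurrently, roughly as in this sketch:

		NEW( obj, ... );  (* constructor returns immediately, Compute starts running *)
		(* ... create further workers or do other work ... *)
		obj.Wait;         (* returns once Compute has finished *)
	*)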
	MatMulHObjR = OBJECT (ComputationObj)
		VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
			add: BOOLEAN;

		PROCEDURE & InitR*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
			add: BOOLEAN );
		BEGIN
			Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
			SELF.MatrixC := MatrixC; SELF.Stride := Stride;
			SELF.IncC := IncC; SELF.StrideC := StrideC;
			SELF.RowsA := RowsA; SELF.RowsB := RowsB;
			SELF.Cols := Cols; SELF.add := add;
		END InitR;

		PROCEDURE Compute;
		BEGIN
			MatMulHBlockR( MatrixA, MatrixB, MatrixC, Stride, IncC,
				StrideC, RowsA, RowsB, Cols, add );
		END Compute;
	END MatMulHObjR;

	MatMulHObjX = OBJECT (ComputationObj)
		VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
			add: BOOLEAN;

		PROCEDURE & InitX*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
			add: BOOLEAN );
		BEGIN
			Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
			SELF.MatrixC := MatrixC; SELF.Stride := Stride;
			SELF.IncC := IncC; SELF.StrideC := StrideC;
			SELF.RowsA := RowsA; SELF.RowsB := RowsB;
			SELF.Cols := Cols; SELF.add := add;
		END InitX;

		PROCEDURE Compute;
		BEGIN
			MatMulHBlockX( MatrixA, MatrixB, MatrixC, Stride, IncC,
				StrideC, RowsA, RowsB, Cols, add );
		END Compute;
	END MatMulHObjX;

	MultiplyObjectR = OBJECT (ComputationObj);
		VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
			start, finished: BOOLEAN;

		PROCEDURE & InitR*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
		BEGIN
			Init; start := FALSE; finished := FALSE;
			SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
			SELF.M := M; SELF.N := N; SELF.K := K;
			SELF.IncC := IncC; SELF.StrideC := StrideC;
			SELF.L2BlockM := L2BlockM;
			SELF.L2BlockN := L2BlockN;
			SELF.L2BlockK := L2BlockK;
		END InitR;

		PROCEDURE Compute;
		BEGIN
			L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
				L2BlockN, L2BlockK );
		END Compute;
	END MultiplyObjectR;

	MultiplyObjectX = OBJECT (ComputationObj);
		VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
			start, finished: BOOLEAN;

		PROCEDURE & InitX*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
		BEGIN
			Init; start := FALSE; finished := FALSE;
			SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
			SELF.M := M; SELF.N := N; SELF.K := K;
			SELF.IncC := IncC; SELF.StrideC := StrideC;
			SELF.L2BlockM := L2BlockM;
			SELF.L2BlockN := L2BlockN;
			SELF.L2BlockK := L2BlockK;
		END InitX;

		PROCEDURE Compute;
		BEGIN
			L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
				L2BlockN, L2BlockK );
		END Compute;
	END MultiplyObjectX;

VAR
	(* ran: Random.Generator; (* testing *)*)
	cachePool: CachePool;

(*********** Part 0: assembler routines ***************)

PROCEDURE -L1Block1XA( adrA, adrB, adrC: ADDRESS; K: SIZE );
CODE {SYSTEM.AMD64, SYSTEM.FPU}
	MOV RAX, [RSP+K] ; RAX IS counter
	MOV RDX, [RSP+adrC]
	MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
	MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
	FLD QWORD [RDX] ; S.GET(dadr, x)
	loop8:
	CMP RAX, 8
	JL loop1
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	SUB RAX, 8 ; DEC(len)
	JMP loop8 ;
	loop1:
	CMP RAX, 0 ; WHILE len > 0 DO
	JLE endL
	FLD QWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 8 ; INC(ladr, incl)
	FLD QWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 8 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	DEC RAX ; DEC(len)
	JMP loop1 ;
	endL:
	FSTP QWORD[RDX] ; S.PUT(dadr, x)
	FWAIT ;
	ADD RSP, 32 ;
END L1Block1XA;
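(* Semantics sketch (not part of the original source): the x87 kernel above accumulates a length-K
   dot product into the destination scalar, i.e.

	C[0] := C[0] + A[0]*B[0] + ... + A[K-1]*B[K-1]

   with the main loop unrolled eight-fold; the trailing ADD RSP, 32 pops the four 8-byte parameters,
   as every one of these "-" code procedures does before falling through to the caller. *)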
PROCEDURE -L1Block1XSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
(*
	matrixA, matrixB must be stored in special format
	K>0 guaranteed
*)
CODE {SYSTEM.AMD64, SYSTEM.SSE2}
	MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
	MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
	MOV RDX, [RSP+K] ; RDX IS counter
	XORPD XMM2, XMM2 ;
	kLoop8: ;
	CMP RDX, 8 ;
	JL kLoop2 ;
	MOVAPD XMM7, [RBX] ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MOVAPD XMM6, [RBX] ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM2, XMM0 ;
	MOVAPD XMM5, [RBX] ;
	MOVAPD XMM3, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MULPD XMM1, XMM6 ;
	ADDPD XMM2, XMM1 ;
	MOVAPD XMM7, [RBX] ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MULPD XMM3, XMM5 ;
	ADDPD XMM2, XMM3 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM2, XMM0 ;
	SUB RDX, 8 ;
	JMP kLoop8 ;
	kLoop2: ;
	CMP RDX, 0 ;
	JLE horizontalAdd ;
	MOVAPD XMM7, [RBX] ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM2, XMM0 ;
	SUB RDX, 2
	JMP kLoop2 ;
	horizontalAdd:
	MOV RDI, [RSP+adrC] ;
	MOVAPD XMM1, XMM2 ;
	SHUFPD XMM1, XMM1, 1 ; low bits <-> high bits
	ADDPD XMM2, XMM1 ;
	ADDSD XMM2, [RDI] ;
	MOVSD [RDI], XMM2 ;
	endL:
	ADD RSP, 32 ;
END L1Block1XSSE;
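(* Semantics sketch: L1Block1XSSE computes the same accumulation as L1Block1XA, but two LONGREALs at a
   time with SSE2. The "special format" mentioned above amounts, judging from the code, to A and B being
   packed contiguously and 16-byte aligned (MOVAPD faults on unaligned addresses); the horizontalAdd
   block folds the two partial sums held in XMM2 into one scalar before adding it to C. *)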
PROCEDURE -L1Block5XSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
(*
	matrixA and matrix B are stored in special format !
	K > 0 is guaranteed
*)
CODE {SYSTEM.AMD64, SYSTEM.SSE2}
	MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
	MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
	MOV RDX, [RSP+K] ; RDX IS counter
	XORPD XMM2, XMM2 ;
	XORPD XMM3, XMM3 ;
	XORPD XMM4, XMM4 ;
	XORPD XMM5, XMM5 ;
	XORPD XMM6, XMM6 ;
	kLoop8: ;
	CMP RDX, 8 ;
	JL kLoop2
	; (*-- 0 -- *) ;
	MOVAPD XMM7, [RBX] ; get 2 elements OF A
	ADD RBX, 16 ;
	MOVAPD XMM0, [RCX] ; get 2 elements OF B
	ADD RCX, 16 ;
	MOVAPD XMM1, [RCX] ; get 2 elements OF B
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM2, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM3, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM4, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM5, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM6, XMM0
	; (*-- 2 -- *) ;
	MOVAPD XMM7, [RBX] ;
	ADD RBX, 16 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM2, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM3, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM4, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM5, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM6, XMM1
	; (*-- 4 -- *) ;
	MOVAPD XMM7, [RBX] ;
	ADD RBX, 16 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM2, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM3, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM4, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM5, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM6, XMM0
	; (*-- 6 -- *) ;
	MOVAPD XMM7, [RBX] ;
	ADD RBX, 16 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM2, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM3, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM4, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM5, XMM0 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM6, XMM1 ;
	SUB RDX, 8
	JMP kLoop8 ;
	kLoop2: ;
	CMP RDX, 0 ;
	JLE horizontalAdd ;
	MOVAPD XMM7, [RBX] ;
	ADD RBX, 16 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM2, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM3, XMM1 ;
	MOVAPD XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM4, XMM0 ;
	MOVAPD XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPD XMM1, XMM7 ;
	ADDPD XMM5, XMM1 ;
	MULPD XMM0, XMM7 ;
	ADDPD XMM6, XMM0 ;
	SUB RDX, 2
	JMP kLoop2 ;
	horizontalAdd: ; add and store
	MOV RDI, [RSP+adrC] ;
	MOV RAX, [RSP+IncC] ;
	MOVAPD XMM1, XMM2 ;
	SHUFPD XMM1, XMM1, 1 ; low bits <-> high bits
	ADDPD XMM2, XMM1 ;
	ADDSD XMM2, [RDI] ;
	MOVSD [RDI], XMM2 ;
	ADD RDI, RAX ;
	MOVAPD XMM1, XMM3 ;
	SHUFPD XMM1, XMM1, 1 ; low bits <-> high bits
	ADDPD XMM3, XMM1 ;
	ADDSD XMM3, [RDI] ;
	MOVSD [RDI], XMM3 ;
	ADD RDI, RAX ;
	MOVAPD XMM1, XMM4 ;
	SHUFPD XMM1, XMM1, 1 ; low bits <-> high bits
	ADDPD XMM4, XMM1 ;
	ADDSD XMM4, [RDI] ;
	MOVSD [RDI], XMM4 ;
	ADD RDI, RAX ;
	MOVAPD XMM1, XMM5 ;
	SHUFPD XMM1, XMM1, 1 ; low bits <-> high bits
	ADDPD XMM5, XMM1 ;
	ADDSD XMM5, [RDI] ;
	MOVSD [RDI], XMM5 ;
	ADD RDI, RAX ;
	MOVAPD XMM1, XMM6 ;
	SHUFPD XMM1, XMM1, 1 ; low bits <-> high bits
	ADDPD XMM6, XMM1 ;
	ADDSD XMM6, [RDI] ;
	MOVSD [RDI], XMM6 ;
	endL:
	ADD RSP, 40 ;
END L1Block5XSSE;
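(* Semantics sketch: judging from the load pattern, A is one packed stream of K LONGREALs while B holds
   five interleaved streams (two elements per stream per round). The kernel keeps five accumulators
   (XMM2..XMM6) and, in horizontalAdd, folds each of them and adds the result to one of five destination
   elements spaced IncC bytes apart, i.e. a 1 x 5 micro-tile of C matching L1BlockN = 5. *)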
PROCEDURE -L1Block1RA( adrA, adrB, adrC: ADDRESS; K: SIZE );
CODE {SYSTEM.AMD64, SYSTEM.FPU}
	MOV RAX, [RSP+K] ; RAX IS counter
	MOV RDX, [RSP+adrC]
	MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
	MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
	FLD DWORD [RDX] ; S.GET(dadr, x)
	loop16:
	CMP RAX, 16
	JL loop1
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	SUB RAX, 16 ; DEC(len)
	JMP loop16 ;
	loop1:
	CMP RAX, 0 ; WHILE len > 0 DO
	JLE endL
	FLD DWORD[RBX] ; S.GET(ladr, x)
	ADD RBX, 4 ; INC(ladr, incl)
	FLD DWORD[RCX] ; S.GET(ladr, y)
	ADD RCX, 4 ; INC(radr, incr)
	FMULP ; x := x*y
	FADDP ; z := z+x
	DEC RAX ; DEC(len)
	JMP loop1 ;
	endL:
	FSTP DWORD[RDX] ; S.PUT(dadr, x)
	FWAIT ;
	ADD RSP, 32 ;
END L1Block1RA;
PROCEDURE -L1Block1RSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
(*
	matrixA, matrixB must be stored in special format
	K>0 guaranteed
*)
CODE {SYSTEM.AMD64, SYSTEM.SSE}
	MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
	MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
	MOV RDX, [RSP+K] ; RDX IS counter
	XORPS XMM2, XMM2 ;
	kLoop16: ;
	CMP RDX, 16 ;
	JL kLoop4 ;
	MOVAPS XMM7, [RBX] ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MOVAPS XMM6, [RBX] ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM2, XMM0 ;
	MOVAPS XMM5, [RBX] ;
	MOVAPS XMM3, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MULPS XMM1, XMM6 ;
	ADDPS XMM2, XMM1 ;
	MOVAPS XMM7, [RBX] ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MULPS XMM3, XMM5 ;
	ADDPS XMM2, XMM3 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM2, XMM0 ;
	SUB RDX, 16 ;
	JMP kLoop16 ;
	kLoop4: ;
	CMP RDX, 0 ;
	JLE horizontalAdd ;
	MOVAPS XMM7, [RBX] ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	ADD RBX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM2, XMM0 ;
	SUB RDX, 4
	JMP kLoop4 ;
	horizontalAdd:
	MOV RDI, [RSP+adrC] ;
	MOVLHPS XMM1, XMM2 ;
	ADDPS XMM1, XMM2 ;
	SHUFPS XMM2, XMM1, 48 ;
	ADDPS XMM2, XMM1 ;
	MOVHLPS XMM2, XMM2 ;
	ADDSS XMM2, [RDI] ;
	MOVSS [RDI], XMM2 ;
	endL:
	ADD RSP, 32 ;
END L1Block1RSSE;
PROCEDURE -L1Block5RSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
(*
	matrixA and matrix B are stored in special format !
	K > 0 is guaranteed
*)
CODE {SYSTEM.AMD64, SYSTEM.SSE}
	MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
	MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
	MOV RDX, [RSP+K] ; RDX IS counter
	XORPS XMM2, XMM2 ;
	XORPS XMM3, XMM3 ;
	XORPS XMM4, XMM4 ;
	XORPS XMM5, XMM5 ;
	XORPS XMM6, XMM6 ;
	kLoop16: ;
	CMP RDX, 16 ;
	JL kLoop4 ; (*-- 0 -- *)
	MOVAPS XMM7, [RBX] ; get 4 elements OF A
	ADD RBX, 16 ;
	MOVAPS XMM0, [RCX] ; get 4 elements OF B
	ADD RCX, 16 ;
	MOVAPS XMM1, [RCX] ; get 4 elements OF B
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM2, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM3, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM4, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM5, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM6, XMM0
	; (*-- 4 -- *) ;
	MOVAPS XMM7, [RBX] ;
	ADD RBX, 16 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM2, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM3, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM4, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM5, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM6, XMM1
	; (*-- 8 -- *) ;
	MOVAPS XMM7, [RBX] ;
	ADD RBX, 16 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM2, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM3, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM4, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM5, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM6, XMM0
	; (*-- 12 -- *) ;
	MOVAPS XMM7, [RBX] ;
	ADD RBX, 16 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM2, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM3, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM4, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM5, XMM0 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM6, XMM1 ;
	SUB RDX, 16
	JMP kLoop16 ;
	kLoop4: ;
	CMP RDX, 0 ;
	JLE horizontalAdd ;
	MOVAPS XMM7, [RBX] ;
	ADD RBX, 16 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM2, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM3, XMM1 ;
	MOVAPS XMM1, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM4, XMM0 ;
	MOVAPS XMM0, [RCX] ;
	ADD RCX, 16 ;
	MULPS XMM1, XMM7 ;
	ADDPS XMM5, XMM1 ;
	MULPS XMM0, XMM7 ;
	ADDPS XMM6, XMM0 ;
	SUB RDX, 4
	JMP kLoop4 ;
	horizontalAdd: ; add and store
	MOV RDI, [RSP+adrC] ;
	MOV RAX, [RSP+IncC] ;
	MOVLHPS XMM1, XMM2 ;
	ADDPS XMM1, XMM2 ;
	SHUFPS XMM2, XMM1, 48 ;
	ADDPS XMM2, XMM1 ;
	MOVHLPS XMM2, XMM2 ;
	ADDSS XMM2, [RDI] ;
	MOVSS [RDI], XMM2 ;
	ADD RDI, RAX ;
	MOVLHPS XMM1, XMM3 ;
	ADDPS XMM1, XMM3 ;
	SHUFPS XMM3, XMM1, 48 ;
	ADDPS XMM3, XMM1 ;
	MOVHLPS XMM3, XMM3 ;
	ADDSS XMM3, [RDI] ;
	MOVSS [RDI], XMM3 ;
	ADD RDI, RAX ;
	MOVLHPS XMM1, XMM4 ;
	ADDPS XMM1, XMM4 ;
	SHUFPS XMM4, XMM1, 48 ;
	ADDPS XMM4, XMM1 ;
	MOVHLPS XMM4, XMM4 ;
	ADDSS XMM4, [RDI] ;
	MOVSS [RDI], XMM4 ;
	ADD RDI, RAX ;
	MOVLHPS XMM1, XMM5 ;
	ADDPS XMM1, XMM5 ;
	SHUFPS XMM5, XMM1, 48 ;
	ADDPS XMM5, XMM1 ;
	MOVHLPS XMM5, XMM5 ;
	ADDSS XMM5, [RDI] ;
	MOVSS [RDI], XMM5 ;
	ADD RDI, RAX ;
	MOVLHPS XMM1, XMM6 ;
	ADDPS XMM1, XMM6 ;
	SHUFPS XMM6, XMM1, 48 ;
	ADDPS XMM6, XMM1 ;
	MOVHLPS XMM6, XMM6 ;
	ADDSS XMM6, [RDI] ;
	MOVSS [RDI], XMM6 ;
	endL:
	ADD RSP, 40 ;
END L1Block5RSSE;
PROCEDURE -Align4( adr: ADDRESS ): ADDRESS;
CODE {SYSTEM.AMD64}
	MOV RAX, [RSP+adr] ;
	NEG RAX ;
	AND RAX, 3H ;
	ADD RAX, [RSP+adr] ;
	ADD RSP, 8
END Align4;

PROCEDURE -Align2( adr: ADDRESS ): ADDRESS;
CODE {SYSTEM.AMD64}
	MOV RAX, [RSP+adr] ;
	NEG RAX ;
	AND RAX, 1H ;
	ADD RAX, [RSP+adr] ;
	ADD RSP, 8
END Align2;
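(* Both helpers round an address up to the next multiple of the alignment: the result is
   adr + ((-adr) AND (align-1)). Worked example for Align4: for adr = 1002H, (-adr) AND 3H = 2,
   so 1004H is returned; an address that is already 4-byte aligned is returned unchanged. *)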
PROCEDURE -ZeroR( adr: ADDRESS; count: SIZE );
(** For 32 bit types *)
CODE {SYSTEM.AMD64}
	MOV RDI, [RSP+adr] ; address OF dest index
	MOV RCX, [RSP+count] ; counter
	MOV RAX, 0 ; value
	CLD ; incremental
	REP ;
	STOSD ;
	ADD RSP, 16 ;
END ZeroR;

PROCEDURE -ZeroX( adr: ADDRESS; count: SIZE );
(** For 64 bit types *)
CODE {SYSTEM.AMD64}
	MOV RDI, [RSP+adr] ; address OF dest index
	MOV RCX, [RSP+count] ; counter
	SHL RCX, 1 ;
	MOV RAX, 0 ; value
	CLD ; incremental
	REP ;
	STOSD ;
	ADD RSP, 16 ;
END ZeroX;

PROCEDURE -ZeroRI( adr: ADDRESS; inc, count: SIZE );
(** For 32 bit types *)
CODE {SYSTEM.AMD64}
	MOV RDI, [RSP+adr] ; address OF dest index
	MOV RBX, [RSP+inc] ;
	MOV RCX, [RSP+count] ; counter
	CMP RBX, 4 ;
	JE fastzero ;
	MOV RAX, 0 ;
	loopL:
	CMP RCX, 0 ;
	JLE endL ;
	MOV [RDI], RAX ;
	ADD RDI, RBX ;
	DEC RCX ;
	JMP loopL ;
	fastzero:
	MOV RAX, 0 ; value
	CLD ; incremental
	REP ;
	STOSD ;
	endL:
	ADD RSP, 24 ;
END ZeroRI;

PROCEDURE -ZeroXI( adr: ADDRESS; inc, count: SIZE );
(** For 64 bit types *)
CODE {SYSTEM.AMD64}
	MOV RDI, [RSP+adr] ; address OF dest index
	MOV RBX, [RSP+inc] ;
	MOV RCX, [RSP+count] ; counter
	MOV RAX, 0 ;
	CMP RBX, 8 ;
	JE fastzero ;
	loopL:
	CMP RCX, 0 ;
	JLE endL ;
	MOV [RDI], RAX ;
	MOV [RDI+4], RAX ;
	ADD RDI, RBX ;
	DEC RCX ;
	JMP loopL ;
	fastzero:
	SHL RCX, 1 ;
	CLD ; incremental
	REP ;
	STOSD ;
	endL:
	ADD RSP, 24 ;
END ZeroXI;
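(* Implementation note for the four Zero* fillers above: ZeroR and ZeroX assume a contiguous destination
   and clear it with a single REP STOSD (ZeroX doubles the dword count via SHL RCX, 1); ZeroRI and ZeroXI
   test the increment and take the same fast path when it equals the element size (4 resp. 8 bytes),
   otherwise they fall back to a scalar loop that strides through memory element by element. *)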
  932. PROCEDURE -MovR( from, to0, frominc, count: SIZE );
  933. CODE {SYSTEM.AMD64}
  934. MOV RDI, [RSP+to0] ; TO
  935. MOV RSI, [RSP+from] ; from
  936. MOV RCX, [RSP+count] ; count
  937. MOV RBX, [RSP+frominc] ; inc
  938. CMP RBX, 4 ;
  939. JE fastmove ;
  940. loopL:
  941. CMP RCX, 0 ;
  942. JLE endL ;
  943. MOV RAX, [RSI] ;
  944. MOV [RDI], RAX ;
  945. ADD RSI, RBX ;
  946. ADD RDI, 4 ;
  947. DEC RCX ;
  948. JMP loopL ;
  949. fastmove:
  950. CLD ; incremental
  951. REP ;
  952. MOVSD ; move rest IN one byte steps
  953. endL:
  954. ADD RSP, 32 ;
  955. END MovR;
  956. PROCEDURE -MovX( from, to0: ADDRESS; frominc, count:SIZE );
  957. CODE {SYSTEM.AMD64}
  958. MOV RDI, [RSP+to0] ; TO
  959. MOV RSI, [RSP+from] ; from
  960. MOV RCX, [RSP+count] ; count
  961. MOV RBX, [RSP+frominc] ; inc
  962. CMP RBX, 8 ;
  963. JE fastmove ;
  964. loopL:
  965. CMP RCX, 0 ;
  966. JLE endL ;
967. MOV EAX, [RSI] ; copy one 64 bit element IN two 32 bit halves
968. MOV [RDI], EAX ;
969. MOV EAX, [RSI+4] ;
970. MOV [RDI+4], EAX ;
  971. ADD RSI, RBX ;
  972. ADD RDI, 8 ;
  973. DEC RCX ;
  974. JMP loopL ;
  975. fastmove:
  976. SHL RCX, 1 ;
  977. CLD ; incremental
  978. REP ;
979. MOVSD ; move the data IN 4 byte steps (two per element)
  980. endL:
  981. ADD RSP, 32 ;
  982. END MovX;
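(* Sketch of MovR and MovX (documentation only; assumes SYSTEM.MOVE): count elements are gathered from a source
with byte increment frominc into a densely packed destination; for frominc = element size the copy degenerates
to a single REP MOVSD:
PROCEDURE MovRSketch( from, to0: ADDRESS; frominc, count: SIZE );
VAR i: SIZE;
BEGIN
FOR i := 0 TO count - 1 DO SYSTEM.MOVE( from, to0, 4 ); INC( from, frominc ); INC( to0, 4 ) END (MovX moves 8 bytes per element)
END MovRSketch; *)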
  983. PROCEDURE -MovR5( src: ADDRESS; inc, stride: SIZE; dest: ADDRESS; count: SIZE);
  984. CODE {SYSTEM.AMD64}
  985. MOV RSI, [RSP+src] ; src
  986. MOV RBX, [RSP+inc] ; inc
  987. MOV RCX, [RSP+stride] ; stride
  988. MOV RDI, [RSP+dest] ; dest
  989. loopL:
  990. MOV RAX, [RSP+count] ; count
  991. CMP RAX, 0 ;
  992. JLE endL ;
  993. SUB RAX, 4 ;
  994. MOV [RSP+count], RAX ;
  995. MOV RDX, RSI ;
  996. MOV RAX, [RDX] ;
  997. MOV [RDI], RAX ;
  998. ADD RDX, RBX ;
  999. MOV RAX, [RDX] ;
  1000. MOV [RDI+16], RAX ;
  1001. ADD RDX, RBX ;
  1002. MOV RAX, [RDX] ;
  1003. MOV [RDI+32], RAX ;
  1004. ADD RDX, RBX ;
  1005. MOV RAX, [RDX] ;
  1006. MOV [RDI+48], RAX ;
  1007. ADD RDX, RBX ;
  1008. MOV RAX, [RDX] ;
  1009. MOV [RDI+64], RAX ;
  1010. ADD RSI, RCX ;
  1011. ADD RDI, 4 ;
  1012. MOV RDX, RSI ;
  1013. MOV RAX, [RDX] ;
  1014. MOV [RDI], RAX ;
  1015. ADD RDX, RBX ;
  1016. MOV RAX, [RDX] ;
  1017. MOV [RDI+16], RAX ;
  1018. ADD RDX, RBX ;
  1019. MOV RAX, [RDX] ;
  1020. MOV [RDI+32], RAX ;
  1021. ADD RDX, RBX ;
  1022. MOV RAX, [RDX] ;
  1023. MOV [RDI+48], RAX ;
  1024. ADD RDX, RBX ;
  1025. MOV RAX, [RDX] ;
  1026. MOV [RDI+64], RAX ;
  1027. ADD RSI, RCX ;
  1028. ADD RDI, 4 ;
  1029. MOV RDX, RSI ;
  1030. MOV RAX, [RDX] ;
  1031. MOV [RDI], RAX ;
  1032. ADD RDX, RBX ;
  1033. MOV RAX, [RDX] ;
  1034. MOV [RDI+16], RAX ;
  1035. ADD RDX, RBX ;
  1036. MOV RAX, [RDX] ;
  1037. MOV [RDI+32], RAX ;
  1038. ADD RDX, RBX ;
  1039. MOV RAX, [RDX] ;
  1040. MOV [RDI+48], RAX ;
  1041. ADD RDX, RBX ;
  1042. MOV RAX, [RDX] ;
  1043. MOV [RDI+64], RAX ;
  1044. ADD RSI, RCX ;
  1045. ADD RDI, 4 ;
  1046. MOV RDX, RSI ;
  1047. MOV RAX, [RDX] ;
  1048. MOV [RDI], RAX ;
  1049. ADD RDX, RBX ;
  1050. MOV RAX, [RDX] ;
  1051. MOV [RDI+16], RAX ;
  1052. ADD RDX, RBX ;
  1053. MOV RAX, [RDX] ;
  1054. MOV [RDI+32], RAX ;
  1055. ADD RDX, RBX ;
  1056. MOV RAX, [RDX] ;
  1057. MOV [RDI+48], RAX ;
  1058. ADD RDX, RBX ;
  1059. MOV RAX, [RDX] ;
  1060. MOV [RDI+64], RAX ;
  1061. ADD RSI, RCX ;
  1062. ADD RDI, 4 ;
  1063. ADD RDI, 64 ;
  1064. JMP loopL ;
  1065. endL:
  1066. ADD RSP, 40 ;
  1067. END MovR5;
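(* MovR5, rough documentation sketch inferred from the offsets above: it packs a stripe of five rows of 32 bit
elements into consecutive 5 x 4 blocks for the L1Block5 kernels. Per column the five elements (inc bytes apart)
are stored at block offsets 0, 16, 32, 48 and 64, the source advances by stride and the destination by 4 bytes;
after four columns the destination skips to the next 80 byte block, and count is consumed in steps of four.
Index sketch (byte offsets): dest[ 80*(c DIV 4) + 16*r + 4*(c MOD 4) ] := src[ stride*c + inc*r ] for r = 0..4. *)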
  1068. (* *)
  1069. PROCEDURE AddAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1070. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1071. MOV RAX, [RBP+len] ;
  1072. MOV RBX, [RBP+ladr] ;
  1073. MOV RCX, [RBP+radr] ;
  1074. MOV RDX, [RBP+dadr] ;
  1075. start:
  1076. CMP RAX, 0 ;
  1077. JLE endL ;
  1078. FLD QWORD [RBX] ;
  1079. ADD RBX, [RBP+linc] ;
  1080. FLD QWORD [RCX] ;
  1081. ADD RCX, [RBP+rinc] ;
  1082. FADDP ;
  1083. FSTP QWORD [RDX] ;
  1084. ADD RDX, [RBP+dinc] ;
  1085. DEC RAX ;
  1086. JMP start ;
  1087. endL:
  1088. FWAIT ;
  1089. END AddAXAXLoopA;
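(* Plain Oberon reference for the x87 loops (documentation only; LONGREAL for the AXAX variants, REAL for ARAR):
PROCEDURE AddAXAXSketch( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
VAR x, y: LONGREAL;
BEGIN
WHILE len > 0 DO
SYSTEM.GET( ladr, x ); SYSTEM.GET( radr, y ); SYSTEM.PUT( dadr, x + y );
INC( ladr, linc ); INC( radr, rinc ); INC( dadr, dinc ); DEC( len )
END
END AddAXAXSketch; *)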
  1090. PROCEDURE AddARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1091. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1092. MOV RAX, [RBP+len] ;
  1093. MOV RBX, [RBP+ladr] ;
  1094. MOV RCX, [RBP+radr] ;
  1095. MOV RDX, [RBP+dadr] ;
  1096. start:
  1097. CMP RAX, 0 ;
  1098. JLE endL ;
  1099. FLD DWORD [RBX] ;
  1100. ADD RBX, [RBP+linc] ;
  1101. FLD DWORD [RCX] ;
  1102. ADD RCX, [RBP+rinc] ;
  1103. FADDP ;
  1104. FSTP DWORD [RDX] ;
  1105. ADD RDX, [RBP+dinc] ;
  1106. DEC RAX ;
  1107. JMP start ;
  1108. endL:
  1109. FWAIT ;
  1110. END AddARARLoopA;
  1111. PROCEDURE AddAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1112. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1113. MOV RAX, [RBP+len] ;
  1114. CMP RAX, 0 ;
  1115. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1116. MOV RBX, [RBP+ladr] ;
  1117. MOV RCX, [RBP+radr] ;
  1118. MOV RDX, [RBP+dadr] ;
1119. ; check IF data are contiguous IN memory
1120. CMP [RBP+linc], 8 ; check left FOR contiguity
1121. JNE single ; not contiguous -> simplest method
1122. CMP [RBP+rinc], 8 ; check right FOR contiguity
1123. JNE single ; not contiguous -> simplest method
1124. CMP [RBP+dinc], 8 ; check destination FOR contiguity
1125. JNE single ; not contiguous -> simplest method
  1126. ; check FOR alignment
  1127. MOV RSI, RBX ;
  1128. AND RSI, 7 ; ladr MOD 8
  1129. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1130. JNE unaligned ; not 64 bit aligned
  1131. MOV RSI, RCX ;
  1132. AND RSI, 7 ; radr MOD 8
  1133. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1134. JNE unaligned ; not 64 bit aligned
  1135. MOV RSI, RDX ;
  1136. AND RSI, 7 ; dadr MOD 8
  1137. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1138. JNE unaligned ; not 64 bit aligned
  1139. MOV RSI, RBX ;
  1140. AND RSI, 8 ; 16 byte alignment
  1141. MOV RDI, RCX ;
  1142. AND RDI, 8 ; 16 byte alignment
  1143. CMP RSI, RDI ;
  1144. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1145. MOV RDI, RDX ;
  1146. AND RDI, 8 ; 16 byte alignment
  1147. CMP RSI, RDI ;
  1148. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
1149. CMP RSI, 8 ;
1150. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1151. ; process one single element TO achieve 128 bit alignment
1152. MOVSD XMM1, [RBX] ;
1153. MOVSD XMM0, [RCX] ;
1154. ADDSD XMM0, XMM1 ;
1155. MOVSD [RDX], XMM0 ;
1156. ADD RBX, 8 ; now RBX IS 16 byte aligned
1157. ADD RCX, 8 ; now RCX IS 16 byte aligned
1158. ADD RDX, 8 ; now RDX IS 16 byte aligned
1159. DEC RAX ; one element has been processed
  1160. aligned:
  1161. aligned8:
1162. CMP RAX, 8 ;
1163. JL aligned2 ; len < 8 -> EXIT TO aligned2
  1164. MOVAPD XMM0, [RBX] ;
  1165. MOVAPD XMM1, [RBX+16] ;
  1166. MOVAPD XMM2, [RBX+32] ;
  1167. MOVAPD XMM3, [RBX+48] ;
  1168. ADD RBX, 64 ;
  1169. MOVAPD XMM4, [RCX] ;
  1170. MOVAPD XMM5, [RCX+16] ;
  1171. MOVAPD XMM6, [RCX+32] ;
  1172. MOVAPD XMM7, [RCX+48] ;
  1173. ADD RCX, 64 ;
  1174. ADDPD XMM0, XMM4 ;
  1175. ADDPD XMM1, XMM5 ;
  1176. ADDPD XMM2, XMM6 ;
  1177. ADDPD XMM3, XMM7 ;
  1178. MOVAPD [RDX], XMM0 ;
  1179. MOVAPD [RDX+16], XMM1 ;
  1180. MOVAPD [RDX+32], XMM2 ;
  1181. MOVAPD [RDX+48], XMM3 ;
  1182. ADD RDX, 64 ;
  1183. SUB RAX, 8 ;
  1184. JMP aligned8 ;
  1185. ; LOOP FOR 2 pieces aligned
  1186. aligned2: ;
  1187. CMP RAX, 2 ;
  1188. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1189. MOVAPD XMM0, [RBX] ;
  1190. ADD RBX, 16 ;
  1191. MOVAPD XMM1, [RCX] ;
  1192. ADD RCX, 16 ;
  1193. ADDPD XMM0, XMM1 ;
  1194. MOVAPD [RDX], XMM0 ;
  1195. ADD RDX, 16 ;
  1196. SUB RAX, 2 ;
  1197. JMP aligned2 ;
1198. ; LOOP FOR 8 unaligned pieces (14 pieces not better!)
1199. unaligned: ;
1200. unaligned8: ;
1201. CMP RAX, 8 ;
1202. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  1203. MOVUPD XMM0, [RBX] ;
  1204. MOVUPD XMM1, [RBX+16] ;
  1205. MOVUPD XMM2, [RBX+32] ;
  1206. MOVUPD XMM3, [RBX+48] ;
  1207. ADD RBX, 64 ;
  1208. MOVUPD XMM4, [RCX] ;
  1209. MOVUPD XMM5, [RCX+16] ;
  1210. MOVUPD XMM6, [RCX+32] ;
  1211. MOVUPD XMM7, [RCX+48] ;
  1212. ADD RCX, 64 ;
  1213. ADDPD XMM0, XMM4 ;
  1214. ADDPD XMM1, XMM5 ;
  1215. ADDPD XMM2, XMM6 ;
  1216. ADDPD XMM3, XMM7 ;
  1217. MOVUPD [RDX], XMM0 ;
  1218. MOVUPD [RDX+16], XMM1 ;
  1219. MOVUPD [RDX+32], XMM2 ;
  1220. MOVUPD [RDX+48], XMM3 ;
  1221. ADD RDX, 64 ;
  1222. SUB RAX, 8 ;
  1223. JMP unaligned8 ;
1224. ; LOOP FOR 2 pieces unaligned
  1225. unaligned2: ;
  1226. CMP RAX, 2 ;
  1227. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1228. MOVUPD XMM0, [RBX] ;
  1229. ADD RBX, 16 ;
  1230. MOVUPD XMM1, [RCX] ;
  1231. ADD RCX, 16 ;
  1232. ADDPD XMM0, XMM1 ;
  1233. MOVUPD [RDX], XMM0 ;
  1234. ADD RDX, 16 ;
  1235. SUB RAX, 2 ;
  1236. JMP unaligned2 ;
  1237. ; one piece left OR non-contiguous data
  1238. single:
  1239. singlepieces: ;
  1240. CMP RAX, 0 ;
  1241. JLE endL ; len <= 0- > EXIT
  1242. MOVSD XMM0, [RBX]
1243. ADD RBX, [RBP+linc] ; INC(ladr, linc)
1244. MOVSD XMM1, [RCX]
1245. ADD RCX, [RBP+rinc] ; INC(radr, rinc)
1246. ADDSD XMM0, XMM1 ;
1247. MOVSD [RDX], XMM0
1248. ADD RDX, [RBP+dinc] ; INC(dadr, dinc)
  1249. DEC RAX ; DEC(len)
  1250. JMP singlepieces ;
  1251. endL:
  1252. END AddAXAXLoopSSE;
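(* Dispatch structure shared by the SSE loops here (AddAXAXLoopSSE and the Sub / EMul analogues), as pseudocode:
IF (linc # 8) OR (rinc # 8) OR (dinc # 8) THEN scalar loop (label single)
ELSIF an address is not 8 byte aligned, or the three addresses differ in their offset MOD 16 THEN
unaligned path: blocks of 8 elements with MOVUPD, then blocks of 2, then the scalar remainder
ELSE optionally process one element so that all pointers reach a 16 byte boundary;
aligned path: blocks of 8 elements with MOVAPD, then blocks of 2, then the scalar remainder
END *)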
  1253. PROCEDURE AddARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1254. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1255. MOV RAX, [RBP+len] ;
  1256. CMP RAX, 0 ;
  1257. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1258. MOV RBX, [RBP+ladr] ;
  1259. MOV RCX, [RBP+radr] ;
  1260. MOV RDX, [RBP+dadr] ;
1261. ; check IF data are contiguous IN memory
1262. CMP [RBP+linc], 4 ; check left FOR contiguity
1263. JNE single ; not contiguous -> simplest method
1264. CMP [RBP+rinc], 4 ; check right FOR contiguity
1265. JNE single ; not contiguous -> simplest method
1266. CMP [RBP+dinc], 4 ; check destination FOR contiguity
1267. JNE single ; not contiguous -> simplest method
  1268. ; check FOR alignment
  1269. MOV RSI, RBX ;
  1270. AND RSI, 3 ; ladr MOD 4
  1271. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1272. JNE unaligned ; not 32 bit aligned
  1273. MOV RSI, RCX ;
  1274. AND RSI, 3 ; radr MOD 4
  1275. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1276. JNE unaligned ; not 32 bit aligned
  1277. MOV RSI, RDX ;
  1278. AND RSI, 3 ; dadr MOD 4
  1279. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1280. JNE unaligned ; not 32 bit aligned
  1281. MOV RSI, RBX ;
  1282. AND RSI, 8+4 ; 16 byte alignment?
  1283. MOV RDI, RCX ;
  1284. AND RDI, 8+4 ; 16 byte alignment?
  1285. CMP RSI, RDI ;
  1286. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1287. MOV RDI, RDX ;
  1288. AND RDI, 8+4 ; 16 byte alignment
  1289. CMP RSI, RDI ;
  1290. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1291. CMP RSI, 0 ;
  1292. JE aligned ; already aligned
  1293. align:
1294. ; process single elements UNTIL 128 bit alignment IS achieved
  1295. MOVSS XMM1, [RBX] ;
  1296. MOVSS XMM0, [RCX] ;
  1297. ADDSS XMM0, XMM1 ;
  1298. MOVSS [RDX], XMM0 ;
  1299. ADD RBX, 4 ;
  1300. ADD RCX, 4 ;
  1301. ADD RDX, 4 ;
  1302. DEC RAX ; one element has been processed ;
  1303. CMP RAX, 0 ; all elements already processed?
  1304. JLE single ;
  1305. MOV RSI, RBX ;
  1306. AND RSI, 8+4 ;
  1307. CMP RSI, 0 ;
  1308. JNE align ;
  1309. aligned:
  1310. aligned16:
  1311. CMP RAX, 16 ;
  1312. JL aligned4 ; len < 16- > EXIT TO singlepieces
  1313. MOVAPS XMM0, [RBX] ;
  1314. MOVAPS XMM1, [RBX+16] ;
  1315. MOVAPS XMM2, [RBX+32] ;
  1316. MOVAPS XMM3, [RBX+48] ;
  1317. ADD RBX, 64 ;
  1318. MOVAPS XMM4, [RCX] ;
  1319. MOVAPS XMM5, [RCX+16] ;
  1320. MOVAPS XMM6, [RCX+32] ;
  1321. MOVAPS XMM7, [RCX+48] ;
  1322. ADD RCX, 64 ;
  1323. ADDPS XMM0, XMM4 ;
  1324. ADDPS XMM1, XMM5 ;
  1325. ADDPS XMM2, XMM6 ;
  1326. ADDPS XMM3, XMM7 ;
  1327. MOVAPS [RDX], XMM0 ;
  1328. MOVAPS [RDX+16], XMM1 ;
  1329. MOVAPS [RDX+32], XMM2 ;
  1330. MOVAPS [RDX+48], XMM3 ;
  1331. ADD RDX, 64 ;
  1332. SUB RAX, 16 ;
  1333. JMP aligned16 ;
  1334. ; LOOP FOR 2 pieces aligned
  1335. aligned4: ;
  1336. CMP RAX, 4 ;
  1337. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1338. MOVAPS XMM0, [RBX] ;
  1339. ADD RBX, 16 ;
  1340. MOVAPS XMM1, [RCX] ;
  1341. ADD RCX, 16 ;
  1342. ADDPS XMM0, XMM1 ;
  1343. MOVAPS [RDX], XMM0 ;
  1344. ADD RDX, 16 ;
  1345. SUB RAX, 4 ;
  1346. JMP aligned4 ;
  1347. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1348. unaligned: ;
  1349. unaligned16: ;
  1350. CMP RAX, 16 ;
  1351. JL unaligned4 ; len < 4- > EXIT TO singlepieces
  1352. MOVUPS XMM0, [RBX] ;
  1353. MOVUPS XMM1, [RBX+16] ;
  1354. MOVUPS XMM2, [RBX+32] ;
  1355. MOVUPS XMM3, [RBX+48] ;
  1356. ADD RBX, 64 ;
  1357. MOVUPS XMM4, [RCX] ;
  1358. MOVUPS XMM5, [RCX+16] ;
  1359. MOVUPS XMM6, [RCX+32] ;
  1360. MOVUPS XMM7, [RCX+48] ;
  1361. ADD RCX, 64 ;
  1362. ADDPS XMM0, XMM4 ;
  1363. ADDPS XMM1, XMM5 ;
  1364. ADDPS XMM2, XMM6 ;
  1365. ADDPS XMM3, XMM7 ;
  1366. MOVUPS [RDX], XMM0 ;
  1367. MOVUPS [RDX+16], XMM1 ;
  1368. MOVUPS [RDX+32], XMM2 ;
  1369. MOVUPS [RDX+48], XMM3 ;
  1370. ADD RDX, 64 ;
  1371. SUB RAX, 16 ;
  1372. JMP unaligned16 ;
  1373. ; LOOP FOR 2 pieces aligned
  1374. unaligned4: ;
  1375. CMP RAX, 4 ;
  1376. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1377. MOVUPS XMM0, [RBX] ;
  1378. ADD RBX, 16 ;
  1379. MOVUPS XMM1, [RCX] ;
  1380. ADD RCX, 16 ;
  1381. ADDPS XMM0, XMM1 ;
  1382. MOVUPS [RDX], XMM0 ;
  1383. ADD RDX, 16 ;
  1384. SUB RAX, 4 ;
  1385. JMP unaligned4 ;
  1386. ; one piece left OR non-contiguous data
  1387. single:
  1388. singlepieces: ;
  1389. CMP RAX, 0 ;
  1390. JLE endL ; len <= 0- > EXIT
  1391. MOVSS XMM0, [RBX]
  1392. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1393. MOVSS XMM1, [RCX]
  1394. ADD RCX, [RBP+rinc] ; INC(ladr, incl)
  1395. ADDSS XMM0, XMM1 ;
  1396. MOVSS [RDX], XMM0
  1397. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1398. DEC RAX ; DEC(len)
  1399. JMP singlepieces ;
  1400. endL:
  1401. END AddARARLoopSSE;
  1402. (* *)
  1403. PROCEDURE SubAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1404. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1405. MOV RAX, [RBP+len] ;
  1406. MOV RBX, [RBP+ladr] ;
  1407. MOV RCX, [RBP+radr] ;
  1408. MOV RDX, [RBP+dadr] ;
  1409. start:
  1410. CMP RAX, 0 ;
  1411. JLE endL ;
  1412. FLD QWORD [RBX] ;
  1413. ADD RBX, [RBP+linc] ;
  1414. FLD QWORD [RCX] ;
  1415. ADD RCX, [RBP+rinc] ;
  1416. FSUBP ;
  1417. FSTP QWORD [RDX] ;
  1418. ADD RDX, [RBP+dinc] ;
  1419. DEC RAX ;
  1420. JMP start ;
  1421. endL:
  1422. FWAIT ;
  1423. END SubAXAXLoopA;
  1424. PROCEDURE SubARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1425. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1426. MOV RAX, [RBP+len] ;
  1427. MOV RBX, [RBP+ladr] ;
  1428. MOV RCX, [RBP+radr] ;
  1429. MOV RDX, [RBP+dadr] ;
  1430. start:
  1431. CMP RAX, 0 ;
  1432. JLE endL ;
  1433. FLD DWORD [RBX] ;
  1434. ADD RBX, [RBP+linc] ;
  1435. FLD DWORD [RCX] ;
  1436. ADD RCX, [RBP+rinc] ;
  1437. FSUBP ;
  1438. FSTP DWORD [RDX] ;
  1439. ADD RDX, [RBP+dinc] ;
  1440. DEC RAX ;
  1441. JMP start ;
  1442. endL:
  1443. FWAIT ;
  1444. END SubARARLoopA;
  1445. PROCEDURE SubAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1446. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1447. MOV RAX, [RBP+len] ;
  1448. CMP RAX, 0 ;
  1449. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1450. MOV RBX, [RBP+ladr] ;
  1451. MOV RCX, [RBP+radr] ;
  1452. MOV RDX, [RBP+dadr] ;
1453. ; check IF data are contiguous IN memory
1454. CMP [RBP+linc], 8 ; check left FOR contiguity
1455. JNE single ; not contiguous -> simplest method
1456. CMP [RBP+rinc], 8 ; check right FOR contiguity
1457. JNE single ; not contiguous -> simplest method
1458. CMP [RBP+dinc], 8 ; check destination FOR contiguity
1459. JNE single ; not contiguous -> simplest method
  1460. ; check FOR alignment
  1461. MOV RSI, RBX ;
  1462. AND RSI, 7 ; ladr MOD 8
  1463. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1464. JNE unaligned ; not 64 bit aligned
  1465. MOV RSI, RCX ;
  1466. AND RSI, 7 ; radr MOD 8
  1467. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1468. JNE unaligned ; not 64 bit aligned
  1469. MOV RSI, RDX ;
  1470. AND RSI, 7 ; dadr MOD 8
  1471. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1472. JNE unaligned ; not 64 bit aligned
  1473. MOV RSI, RBX ;
  1474. AND RSI, 8 ; 16 byte alignment
  1475. MOV RDI, RCX ;
  1476. AND RDI, 8 ; 16 byte alignment
  1477. CMP RSI, RDI ;
  1478. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1479. MOV RDI, RDX ;
  1480. AND RDI, 8 ; 16 byte alignment
  1481. CMP RSI, RDI ;
  1482. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
1483. CMP RSI, 8 ;
1484. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1485. ; process one single element TO achieve 128 bit alignment
1486. MOVSD XMM0, [RBX] ;
1487. MOVSD XMM1, [RCX] ;
1488. SUBSD XMM0, XMM1 ; left - right, consistent WITH the packed LOOP below
1489. MOVSD [RDX], XMM0 ;
1490. ADD RBX, 8 ; now RBX IS 16 byte aligned
1491. ADD RCX, 8 ; now RCX IS 16 byte aligned
1492. ADD RDX, 8 ; now RDX IS 16 byte aligned
1493. DEC RAX ; one element has been processed
  1494. aligned:
  1495. aligned8:
  1496. CMP RAX, 8 ;
1497. JL aligned2 ; len < 8 -> EXIT TO aligned2
  1498. MOVAPD XMM0, [RBX] ;
  1499. MOVAPD XMM1, [RBX+16] ;
  1500. MOVAPD XMM2, [RBX+32] ;
  1501. MOVAPD XMM3, [RBX+48] ;
  1502. ADD RBX, 64 ;
  1503. MOVAPD XMM4, [RCX] ;
  1504. MOVAPD XMM5, [RCX+16] ;
  1505. MOVAPD XMM6, [RCX+32] ;
  1506. MOVAPD XMM7, [RCX+48] ;
  1507. ADD RCX, 64 ;
  1508. SUBPD XMM0, XMM4 ;
  1509. SUBPD XMM1, XMM5 ;
  1510. SUBPD XMM2, XMM6 ;
  1511. SUBPD XMM3, XMM7 ;
  1512. MOVAPD [RDX], XMM0 ;
  1513. MOVAPD [RDX+16], XMM1 ;
  1514. MOVAPD [RDX+32], XMM2 ;
  1515. MOVAPD [RDX+48], XMM3 ;
  1516. ADD RDX, 64 ;
  1517. SUB RAX, 8 ;
  1518. JMP aligned8 ;
  1519. ; LOOP FOR 2 pieces aligned
  1520. aligned2: ;
  1521. CMP RAX, 2 ;
  1522. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1523. MOVAPD XMM0, [RBX] ;
  1524. ADD RBX, 16 ;
  1525. MOVAPD XMM1, [RCX] ;
  1526. ADD RCX, 16 ;
  1527. SUBPD XMM0, XMM1 ;
  1528. MOVAPD [RDX], XMM0 ;
  1529. ADD RDX, 16 ;
  1530. SUB RAX, 2 ;
  1531. JMP aligned2 ;
  1532. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1533. unaligned: ;
  1534. unaligned8: ;
  1535. CMP RAX, 8 ;
1536. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  1537. MOVUPD XMM0, [RBX] ;
  1538. MOVUPD XMM1, [RBX+16] ;
  1539. MOVUPD XMM2, [RBX+32] ;
  1540. MOVUPD XMM3, [RBX+48] ;
  1541. ADD RBX, 64 ;
  1542. MOVUPD XMM4, [RCX] ;
  1543. MOVUPD XMM5, [RCX+16] ;
  1544. MOVUPD XMM6, [RCX+32] ;
  1545. MOVUPD XMM7, [RCX+48] ;
  1546. ADD RCX, 64 ;
  1547. SUBPD XMM0, XMM4 ;
  1548. SUBPD XMM1, XMM5 ;
  1549. SUBPD XMM2, XMM6 ;
  1550. SUBPD XMM3, XMM7 ;
  1551. MOVUPD [RDX], XMM0 ;
  1552. MOVUPD [RDX+16], XMM1 ;
  1553. MOVUPD [RDX+32], XMM2 ;
  1554. MOVUPD [RDX+48], XMM3 ;
  1555. ADD RDX, 64 ;
  1556. SUB RAX, 8 ;
  1557. JMP unaligned8 ;
  1558. ; LOOP FOR 2 pieces aligned
  1559. unaligned2: ;
  1560. CMP RAX, 2 ;
  1561. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1562. MOVUPD XMM0, [RBX] ;
  1563. ADD RBX, 16 ;
  1564. MOVUPD XMM1, [RCX] ;
  1565. ADD RCX, 16 ;
  1566. SUBPD XMM0, XMM1 ;
  1567. MOVUPD [RDX], XMM0 ;
  1568. ADD RDX, 16 ;
  1569. SUB RAX, 2 ;
  1570. JMP unaligned2 ;
  1571. ; one piece left OR non-contiguous data
  1572. single:
  1573. singlepieces: ;
  1574. CMP RAX, 0 ;
  1575. JLE endL ; len <= 0- > EXIT
  1576. MOVSD XMM0, [RBX]
  1577. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1578. MOVSD XMM1, [RCX]
  1579. ADD RCX, [RBP+rinc] ; INC(ladr, incl)
  1580. SUBSD XMM0, XMM1 ;
  1581. MOVSD [RDX], XMM0
  1582. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1583. DEC RAX ; DEC(len)
  1584. JMP singlepieces ;
  1585. endL:
  1586. END SubAXAXLoopSSE;
  1587. PROCEDURE SubARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1588. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1589. MOV RAX, [RBP+len] ;
  1590. CMP RAX, 0 ;
  1591. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1592. MOV RBX, [RBP+ladr] ;
  1593. MOV RCX, [RBP+radr] ;
  1594. MOV RDX, [RBP+dadr] ;
1595. ; check IF data are contiguous IN memory
1596. CMP [RBP+linc], 4 ; check left FOR contiguity
1597. JNE single ; not contiguous -> simplest method
1598. CMP [RBP+rinc], 4 ; check right FOR contiguity
1599. JNE single ; not contiguous -> simplest method
1600. CMP [RBP+dinc], 4 ; check destination FOR contiguity
1601. JNE single ; not contiguous -> simplest method
  1602. ; check FOR alignment
  1603. MOV RSI, RBX ;
  1604. AND RSI, 3 ; ladr MOD 4
  1605. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1606. JNE unaligned ; not 32 bit aligned
  1607. MOV RSI, RCX ;
  1608. AND RSI, 3 ; radr MOD 4
  1609. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1610. JNE unaligned ; not 32 bit aligned
  1611. MOV RSI, RDX ;
  1612. AND RSI, 3 ; dadr MOD 4
  1613. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1614. JNE unaligned ; not 32 bit aligned
  1615. MOV RSI, RBX ;
  1616. AND RSI, 8+4 ; 16 byte alignment?
  1617. MOV RDI, RCX ;
  1618. AND RDI, 8+4 ; 16 byte alignment?
  1619. CMP RSI, RDI ;
  1620. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1621. MOV RDI, RDX ;
  1622. AND RDI, 8+4 ; 16 byte alignment
  1623. CMP RSI, RDI ;
  1624. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1625. CMP RSI, 0 ;
  1626. JE aligned ; already aligned
  1627. align:
1628. ; process single elements UNTIL 128 bit alignment IS achieved
1629. MOVSS XMM0, [RBX] ;
1630. MOVSS XMM1, [RCX] ;
1631. SUBSS XMM0, XMM1 ; left - right, consistent WITH the packed LOOP below
1632. MOVSS [RDX], XMM0 ;
  1633. ADD RBX, 4 ;
  1634. ADD RCX, 4 ;
  1635. ADD RDX, 4 ;
  1636. DEC RAX ; one element has been processed ;
  1637. CMP RAX, 0 ; all elements already processed?
  1638. JLE single ;
  1639. MOV RSI, RBX ;
  1640. AND RSI, 8+4 ;
  1641. CMP RSI, 0 ;
  1642. JNE align ;
  1643. aligned:
  1644. aligned16:
  1645. CMP RAX, 16 ;
  1646. JL aligned4 ; len < 16- > EXIT TO singlepieces
  1647. MOVAPS XMM0, [RBX] ;
  1648. MOVAPS XMM1, [RBX+16] ;
  1649. MOVAPS XMM2, [RBX+32] ;
  1650. MOVAPS XMM3, [RBX+48] ;
  1651. ADD RBX, 64 ;
  1652. MOVAPS XMM4, [RCX] ;
  1653. MOVAPS XMM5, [RCX+16] ;
  1654. MOVAPS XMM6, [RCX+32] ;
  1655. MOVAPS XMM7, [RCX+48] ;
  1656. ADD RCX, 64 ;
  1657. SUBPS XMM0, XMM4 ;
  1658. SUBPS XMM1, XMM5 ;
  1659. SUBPS XMM2, XMM6 ;
  1660. SUBPS XMM3, XMM7 ;
  1661. MOVAPS [RDX], XMM0 ;
  1662. MOVAPS [RDX+16], XMM1 ;
  1663. MOVAPS [RDX+32], XMM2 ;
  1664. MOVAPS [RDX+48], XMM3 ;
  1665. ADD RDX, 64 ;
  1666. SUB RAX, 16 ;
  1667. JMP aligned16 ;
  1668. ; LOOP FOR 2 pieces aligned
  1669. aligned4: ;
  1670. CMP RAX, 4 ;
  1671. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1672. MOVAPS XMM0, [RBX] ;
  1673. ADD RBX, 16 ;
  1674. MOVAPS XMM1, [RCX] ;
  1675. ADD RCX, 16 ;
  1676. SUBPS XMM0, XMM1 ;
  1677. MOVAPS [RDX], XMM0 ;
  1678. ADD RDX, 16 ;
  1679. SUB RAX, 4 ;
  1680. JMP aligned4 ;
  1681. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1682. unaligned: ;
  1683. unaligned16: ;
  1684. CMP RAX, 16 ;
  1685. JL unaligned4 ; len < 4- > EXIT TO singlepieces
  1686. MOVUPS XMM0, [RBX] ;
  1687. MOVUPS XMM1, [RBX+16] ;
  1688. MOVUPS XMM2, [RBX+32] ;
  1689. MOVUPS XMM3, [RBX+48] ;
  1690. ADD RBX, 64 ;
  1691. MOVUPS XMM4, [RCX] ;
  1692. MOVUPS XMM5, [RCX+16] ;
  1693. MOVUPS XMM6, [RCX+32] ;
  1694. MOVUPS XMM7, [RCX+48] ;
  1695. ADD RCX, 64 ;
  1696. SUBPS XMM0, XMM4 ;
  1697. SUBPS XMM1, XMM5 ;
  1698. SUBPS XMM2, XMM6 ;
  1699. SUBPS XMM3, XMM7 ;
  1700. MOVUPS [RDX], XMM0 ;
  1701. MOVUPS [RDX+16], XMM1 ;
  1702. MOVUPS [RDX+32], XMM2 ;
  1703. MOVUPS [RDX+48], XMM3 ;
  1704. ADD RDX, 64 ;
  1705. SUB RAX, 16 ;
  1706. JMP unaligned16 ;
  1707. ; LOOP FOR 2 pieces aligned
  1708. unaligned4: ;
  1709. CMP RAX, 4 ;
  1710. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1711. MOVUPS XMM0, [RBX] ;
  1712. ADD RBX, 16 ;
  1713. MOVUPS XMM1, [RCX] ;
  1714. ADD RCX, 16 ;
  1715. SUBPS XMM0, XMM1 ;
  1716. MOVUPS [RDX], XMM0 ;
  1717. ADD RDX, 16 ;
  1718. SUB RAX, 4 ;
  1719. JMP unaligned4 ;
  1720. ; one piece left OR non-contiguous data
  1721. single:
  1722. singlepieces: ;
  1723. CMP RAX, 0 ;
  1724. JLE endL ; len <= 0- > EXIT
  1725. MOVSS XMM0, [RBX]
  1726. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1727. MOVSS XMM1, [RCX]
  1728. ADD RCX, [RBP+rinc] ; INC(ladr, incl)
  1729. SUBSS XMM0, XMM1 ;
  1730. MOVSS [RDX], XMM0
  1731. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1732. DEC RAX ; DEC(len)
  1733. JMP singlepieces ;
  1734. endL:
  1735. END SubARARLoopSSE;
  1736. (* *)
  1737. PROCEDURE EMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1738. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1739. MOV RAX, [RBP+len] ;
  1740. MOV RBX, [RBP+ladr] ;
  1741. MOV RCX, [RBP+radr] ;
  1742. MOV RDX, [RBP+dadr] ;
  1743. start:
  1744. CMP RAX, 0 ;
  1745. JLE endL ;
  1746. FLD QWORD [RBX] ;
  1747. ADD RBX, [RBP+linc] ;
  1748. FLD QWORD [RCX] ;
  1749. ADD RCX, [RBP+rinc] ;
  1750. FMULP ;
  1751. FSTP QWORD [RDX] ;
  1752. ADD RDX, [RBP+dinc] ;
  1753. DEC RAX ;
  1754. JMP start ;
  1755. endL:
  1756. FWAIT ;
  1757. END EMulAXAXLoopA;
  1758. PROCEDURE EMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1759. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1760. MOV RAX, [RBP+len] ;
  1761. MOV RBX, [RBP+ladr] ;
  1762. MOV RCX, [RBP+radr] ;
  1763. MOV RDX, [RBP+dadr] ;
  1764. start:
  1765. CMP RAX, 0 ;
  1766. JLE endL ;
  1767. FLD DWORD [RBX] ;
  1768. ADD RBX, [RBP+linc] ;
  1769. FLD DWORD [RCX] ;
  1770. ADD RCX, [RBP+rinc] ;
  1771. FMULP ;
  1772. FSTP DWORD [RDX] ;
  1773. ADD RDX, [RBP+dinc] ;
  1774. DEC RAX ;
  1775. JMP start ;
  1776. endL:
  1777. FWAIT ;
  1778. END EMulARARLoopA;
  1779. PROCEDURE EMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1780. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1781. MOV RAX, [RBP+len] ;
  1782. CMP RAX, 0 ;
  1783. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1784. MOV RBX, [RBP+ladr] ;
  1785. MOV RCX, [RBP+radr] ;
  1786. MOV RDX, [RBP+dadr] ;
1787. ; check IF data are contiguous IN memory
1788. CMP [RBP+linc], 8 ; check left FOR contiguity
1789. JNE single ; not contiguous -> simplest method
1790. CMP [RBP+rinc], 8 ; check right FOR contiguity
1791. JNE single ; not contiguous -> simplest method
1792. CMP [RBP+dinc], 8 ; check destination FOR contiguity
1793. JNE single ; not contiguous -> simplest method
  1794. ; check FOR alignment
  1795. MOV RSI, RBX ;
  1796. AND RSI, 7 ; ladr MOD 8
  1797. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1798. JNE unaligned ; not 64 bit aligned
  1799. MOV RSI, RCX ;
  1800. AND RSI, 7 ; radr MOD 8
  1801. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1802. JNE unaligned ; not 64 bit aligned
  1803. MOV RSI, RDX ;
  1804. AND RSI, 7 ; dadr MOD 8
  1805. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1806. JNE unaligned ; not 64 bit aligned
  1807. MOV RSI, RBX ;
  1808. AND RSI, 8 ; 16 byte alignment
  1809. MOV RDI, RCX ;
  1810. AND RDI, 8 ; 16 byte alignment
  1811. CMP RSI, RDI ;
  1812. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1813. MOV RDI, RDX ;
  1814. AND RDI, 8 ; 16 byte alignment
  1815. CMP RSI, RDI ;
  1816. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
1817. CMP RSI, 8 ;
1818. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1819. ; process one single element TO achieve 128 bit alignment
1820. MOVSD XMM1, [RBX] ;
1821. MOVSD XMM0, [RCX] ;
1822. MULSD XMM0, XMM1 ;
1823. MOVSD [RDX], XMM0 ;
1824. ADD RBX, 8 ; now RBX IS 16 byte aligned
1825. ADD RCX, 8 ; now RCX IS 16 byte aligned
1826. ADD RDX, 8 ; now RDX IS 16 byte aligned
1827. DEC RAX ; one element has been processed
  1828. aligned:
  1829. aligned8:
  1830. CMP RAX, 8 ;
1831. JL aligned2 ; len < 8 -> EXIT TO aligned2
  1832. MOVAPD XMM0, [RBX] ;
  1833. MOVAPD XMM1, [RBX+16] ;
  1834. MOVAPD XMM2, [RBX+32] ;
  1835. MOVAPD XMM3, [RBX+48] ;
  1836. ADD RBX, 64 ;
  1837. MOVAPD XMM4, [RCX] ;
  1838. MOVAPD XMM5, [RCX+16] ;
  1839. MOVAPD XMM6, [RCX+32] ;
  1840. MOVAPD XMM7, [RCX+48] ;
  1841. ADD RCX, 64 ;
  1842. MULPD XMM0, XMM4 ;
  1843. MULPD XMM1, XMM5 ;
  1844. MULPD XMM2, XMM6 ;
  1845. MULPD XMM3, XMM7 ;
  1846. MOVAPD [RDX], XMM0 ;
  1847. MOVAPD [RDX+16], XMM1 ;
  1848. MOVAPD [RDX+32], XMM2 ;
  1849. MOVAPD [RDX+48], XMM3 ;
  1850. ADD RDX, 64 ;
  1851. SUB RAX, 8 ;
  1852. JMP aligned8 ;
  1853. ; LOOP FOR 2 pieces aligned
  1854. aligned2: ;
  1855. CMP RAX, 2 ;
  1856. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1857. MOVAPD XMM0, [RBX] ;
  1858. ADD RBX, 16 ;
  1859. MOVAPD XMM1, [RCX] ;
  1860. ADD RCX, 16 ;
  1861. MULPD XMM0, XMM1 ;
  1862. MOVAPD [RDX], XMM0 ;
  1863. ADD RDX, 16 ;
  1864. SUB RAX, 2 ;
  1865. JMP aligned2 ;
  1866. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1867. unaligned: ;
  1868. unaligned8: ;
  1869. CMP RAX, 8 ;
1870. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  1871. MOVUPD XMM0, [RBX] ;
  1872. MOVUPD XMM1, [RBX+16] ;
  1873. MOVUPD XMM2, [RBX+32] ;
  1874. MOVUPD XMM3, [RBX+48] ;
  1875. ADD RBX, 64 ;
  1876. MOVUPD XMM4, [RCX] ;
  1877. MOVUPD XMM5, [RCX+16] ;
  1878. MOVUPD XMM6, [RCX+32] ;
  1879. MOVUPD XMM7, [RCX+48] ;
  1880. ADD RCX, 64 ;
  1881. MULPD XMM0, XMM4 ;
  1882. MULPD XMM1, XMM5 ;
  1883. MULPD XMM2, XMM6 ;
  1884. MULPD XMM3, XMM7 ;
  1885. MOVUPD [RDX], XMM0 ;
  1886. MOVUPD [RDX+16], XMM1 ;
  1887. MOVUPD [RDX+32], XMM2 ;
  1888. MOVUPD [RDX+48], XMM3 ;
  1889. ADD RDX, 64 ;
  1890. SUB RAX, 8 ;
  1891. JMP unaligned8 ;
  1892. ; LOOP FOR 2 pieces aligned
  1893. unaligned2: ;
  1894. CMP RAX, 2 ;
  1895. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1896. MOVUPD XMM0, [RBX] ;
  1897. ADD RBX, 16 ;
  1898. MOVUPD XMM1, [RCX] ;
  1899. ADD RCX, 16 ;
  1900. MULPD XMM0, XMM1 ;
  1901. MOVUPD [RDX], XMM0 ;
  1902. ADD RDX, 16 ;
  1903. SUB RAX, 2 ;
  1904. JMP unaligned2 ;
  1905. ; one piece left OR non-contiguous data
  1906. single:
  1907. singlepieces: ;
  1908. CMP RAX, 0 ;
  1909. JLE endL ; len <= 0- > EXIT
  1910. MOVSD XMM0, [RBX]
  1911. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1912. MOVSD XMM1, [RCX]
  1913. ADD RCX, [RBP+rinc] ; INC(ladr, incl)
  1914. MULSD XMM0, XMM1 ;
  1915. MOVSD [RDX], XMM0
  1916. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1917. DEC RAX ; DEC(len)
  1918. JMP singlepieces ;
  1919. endL:
  1920. END EMulAXAXLoopSSE;
  1921. PROCEDURE EMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1922. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1923. MOV RAX, [RBP+len] ;
  1924. CMP RAX, 0 ;
  1925. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1926. MOV RBX, [RBP+ladr] ;
  1927. MOV RCX, [RBP+radr] ;
  1928. MOV RDX, [RBP+dadr] ;
1929. ; check IF data are contiguous IN memory
1930. CMP [RBP+linc], 4 ; check left FOR contiguity
1931. JNE single ; not contiguous -> simplest method
1932. CMP [RBP+rinc], 4 ; check right FOR contiguity
1933. JNE single ; not contiguous -> simplest method
1934. CMP [RBP+dinc], 4 ; check destination FOR contiguity
1935. JNE single ; not contiguous -> simplest method
  1936. ; check FOR alignment
  1937. MOV RSI, RBX ;
  1938. AND RSI, 3 ; ladr MOD 4
  1939. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1940. JNE unaligned ; not 32 bit aligned
  1941. MOV RSI, RCX ;
  1942. AND RSI, 3 ; radr MOD 4
  1943. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1944. JNE unaligned ; not 32 bit aligned
  1945. MOV RSI, RDX ;
  1946. AND RSI, 3 ; dadr MOD 4
  1947. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1948. JNE unaligned ; not 32 bit aligned
  1949. MOV RSI, RBX ;
  1950. AND RSI, 8+4 ; 16 byte alignment?
  1951. MOV RDI, RCX ;
  1952. AND RDI, 8+4 ; 16 byte alignment?
  1953. CMP RSI, RDI ;
  1954. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1955. MOV RDI, RDX ;
  1956. AND RDI, 8+4 ; 16 byte alignment
  1957. CMP RSI, RDI ;
  1958. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1959. CMP RSI, 0 ;
  1960. JE aligned ; already aligned
  1961. align:
1962. ; process single elements UNTIL 128 bit alignment IS achieved
  1963. MOVSS XMM1, [RBX] ;
  1964. MOVSS XMM0, [RCX] ;
  1965. MULSS XMM0, XMM1 ;
  1966. MOVSS [RDX], XMM0 ;
  1967. ADD RBX, 4 ;
  1968. ADD RCX, 4 ;
  1969. ADD RDX, 4 ;
  1970. DEC RAX ; one element has been processed ;
  1971. CMP RAX, 0 ; all elements already processed?
  1972. JLE single ;
  1973. MOV RSI, RBX ;
  1974. AND RSI, 8+4 ;
  1975. CMP RSI, 0 ;
  1976. JNE align ;
  1977. aligned:
  1978. aligned16:
  1979. CMP RAX, 16 ;
  1980. JL aligned4 ; len < 16- > EXIT TO singlepieces
  1981. MOVAPS XMM0, [RBX] ;
  1982. MOVAPS XMM1, [RBX+16] ;
  1983. MOVAPS XMM2, [RBX+32] ;
  1984. MOVAPS XMM3, [RBX+48] ;
  1985. ADD RBX, 64 ;
  1986. MOVAPS XMM4, [RCX] ;
  1987. MOVAPS XMM5, [RCX+16] ;
  1988. MOVAPS XMM6, [RCX+32] ;
  1989. MOVAPS XMM7, [RCX+48] ;
  1990. ADD RCX, 64 ;
  1991. MULPS XMM0, XMM4 ;
  1992. MULPS XMM1, XMM5 ;
  1993. MULPS XMM2, XMM6 ;
  1994. MULPS XMM3, XMM7 ;
  1995. MOVAPS [RDX], XMM0 ;
  1996. MOVAPS [RDX+16], XMM1 ;
  1997. MOVAPS [RDX+32], XMM2 ;
  1998. MOVAPS [RDX+48], XMM3 ;
  1999. ADD RDX, 64 ;
  2000. SUB RAX, 16 ;
  2001. JMP aligned16 ;
  2002. ; LOOP FOR 2 pieces aligned
  2003. aligned4: ;
  2004. CMP RAX, 4 ;
  2005. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2006. MOVAPS XMM0, [RBX] ;
  2007. ADD RBX, 16 ;
  2008. MOVAPS XMM1, [RCX] ;
  2009. ADD RCX, 16 ;
  2010. MULPS XMM0, XMM1 ;
  2011. MOVAPS [RDX], XMM0 ;
  2012. ADD RDX, 16 ;
  2013. SUB RAX, 4 ;
  2014. JMP aligned4 ;
  2015. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2016. unaligned: ;
  2017. unaligned16: ;
  2018. CMP RAX, 16 ;
  2019. JL unaligned4 ; len < 4- > EXIT TO singlepieces
  2020. MOVUPS XMM0, [RBX] ;
  2021. MOVUPS XMM1, [RBX+16] ;
  2022. MOVUPS XMM2, [RBX+32] ;
  2023. MOVUPS XMM3, [RBX+48] ;
  2024. ADD RBX, 64 ;
  2025. MOVUPS XMM4, [RCX] ;
  2026. MOVUPS XMM5, [RCX+16] ;
  2027. MOVUPS XMM6, [RCX+32] ;
  2028. MOVUPS XMM7, [RCX+48] ;
  2029. ADD RCX, 64 ;
  2030. MULPS XMM0, XMM4 ;
  2031. MULPS XMM1, XMM5 ;
  2032. MULPS XMM2, XMM6 ;
  2033. MULPS XMM3, XMM7 ;
  2034. MOVUPS [RDX], XMM0 ;
  2035. MOVUPS [RDX+16], XMM1 ;
  2036. MOVUPS [RDX+32], XMM2 ;
  2037. MOVUPS [RDX+48], XMM3 ;
  2038. ADD RDX, 64 ;
  2039. SUB RAX, 16 ;
  2040. JMP unaligned16 ;
  2041. ; LOOP FOR 2 pieces aligned
  2042. unaligned4: ;
  2043. CMP RAX, 4 ;
  2044. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2045. MOVUPS XMM0, [RBX] ;
  2046. ADD RBX, 16 ;
  2047. MOVUPS XMM1, [RCX] ;
  2048. ADD RCX, 16 ;
  2049. MULPS XMM0, XMM1 ;
  2050. MOVUPS [RDX], XMM0 ;
  2051. ADD RDX, 16 ;
  2052. SUB RAX, 4 ;
  2053. JMP unaligned4 ;
  2054. ; one piece left OR non-contiguous data
  2055. single:
  2056. singlepieces: ;
  2057. CMP RAX, 0 ;
  2058. JLE endL ; len <= 0- > EXIT
  2059. MOVSS XMM0, [RBX]
  2060. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2061. MOVSS XMM1, [RCX]
  2062. ADD RCX, [RBP+rinc] ; INC(ladr, incl)
  2063. MULSS XMM0, XMM1 ;
  2064. MOVSS [RDX], XMM0
  2065. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2066. DEC RAX ; DEC(len)
  2067. JMP singlepieces ;
  2068. endL:
  2069. END EMulARARLoopSSE;
  2070. PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  2071. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2072. MOV RAX, [RBP+len] ; eax := len
  2073. MOV RBX, [RBP+ladr] ; ebx := ladr
  2074. MOV RCX, [RBP+radr] ; ecx := radr
  2075. MOV RDX, [RBP+dadr] ; edx := dadr
  2076. FLD QWORD [RDX] ; S.GET(dadr, x)
  2077. start:
  2078. CMP RAX, 0 ; WHILE len > 0 DO
  2079. JLE endL
  2080. FLD QWORD [RBX] ; S.GET(ladr, x)
  2081. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2082. FLD QWORD [RCX] ; S.GET(ladr, y)
  2083. FMULP ; x := x*y
  2084. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  2085. FADDP ; z := z+x
  2086. DEC RAX ; DEC(len)
  2087. JMP start ;
  2088. endL:
  2089. FSTP QWORD [RDX] ; S.PUT(dadr, x)
  2090. FWAIT ;
  2091. END SPAXAXLoopA;
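(* Reference semantics of the scalar product loops (documentation only): the destination element serves as accumulator.
PROCEDURE SPAXAXSketch( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
VAR x, y, sum: LONGREAL;
BEGIN
SYSTEM.GET( dadr, sum );
WHILE len > 0 DO
SYSTEM.GET( ladr, x ); SYSTEM.GET( radr, y ); sum := sum + x * y;
INC( ladr, linc ); INC( radr, rinc ); DEC( len )
END;
SYSTEM.PUT( dadr, sum )
END SPAXAXSketch; *)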
  2092. PROCEDURE SPARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  2093. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2094. MOV RAX, [RBP+len] ; eax := len
  2095. MOV RBX, [RBP+ladr] ; ebx := ladr
  2096. MOV RCX, [RBP+radr] ; ecx := radr
  2097. MOV RDX, [RBP+dadr] ; edx := dadr
  2098. FLD DWORD [RDX] ; S.GET(dadr, x)
  2099. start:
  2100. CMP RAX, 0 ; WHILE len > 0 DO
  2101. JLE endL
  2102. FLD DWORD [RBX] ; S.GET(ladr, x)
  2103. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2104. FLD DWORD [RCX] ; S.GET(ladr, y)
  2105. FMULP ; x := x*y
  2106. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  2107. FADDP ; z := z+x
  2108. DEC RAX ; DEC(len)
  2109. JMP start ;
  2110. endL:
  2111. FSTP DWORD [RDX] ; S.PUT(dadr, x)
  2112. FWAIT ;
  2113. END SPARARLoopA;
  2114. (* sse version of scalar product *)
  2115. PROCEDURE SPAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  2116. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2117. ; register initialization
2118. MOV RAX, [RBP+len] ; RAX reserved FOR length
2119. CMP RAX, 0 ;
2120. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
2121. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
2122. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
2123. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
2124. XORPD XMM0, XMM0 ;
2125. MOVSD XMM0, [RDX] ; destination -> low half OF XMM0
2126. CMP [RBP+linc], 8 ; check left FOR contiguity
2127. JNE single ; not contiguous -> simplest method
2128. CMP [RBP+rinc], 8 ; check right FOR contiguity
2129. JNE single ; not contiguous -> simplest method
  2130. ; check FOR alignment
  2131. MOV RSI, RBX ;
  2132. AND RSI, 7 ; ladr MOD 8
  2133. CMP RSI, 0 ; RCX = 0- > 64 Bit alignment
  2134. JNE unaligned ; not 64 bit aligned
  2135. MOV RSI, RCX ;
  2136. AND RSI, 7 ; radr MOD 8
  2137. CMP RSI, 0 ; = 0- > 64 Bit alignment
  2138. JNE unaligned ; not 64 bit aligned
  2139. MOV RSI, RBX ;
  2140. AND RSI, 8 ; 16 byte alignment
  2141. MOV RDI, RCX ;
  2142. AND RDI, 8 ; 16 byte alignment
  2143. CMP RSI, RDI ;
2144. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
2145. CMP RSI, 8 ;
2146. JNE aligned ; ladr and radr already 128 bit aligned
2147. ; process one single element TO achieve 128 bit alignment
2148. MOVSD XMM1, [RBX] ;
2149. MOVSD XMM2, [RCX] ;
2150. MULSD XMM1, XMM2 ;
2151. ADDSD XMM0, XMM1 ;
2152. ADD RBX, 8 ; now RBX IS 16 byte aligned
2153. ADD RCX, 8 ; now RCX IS 16 byte aligned
2154. DEC RAX ; one element has been processed
2155. ; LOOP FOR 6 pieces aligned
2156. aligned:
2157. aligned6:
2158. CMP RAX, 6 ;
2159. JL aligned2 ; len < 6 -> EXIT TO aligned2
  2160. MOVAPD XMM1, [RBX] ;
  2161. MOVAPD XMM2, [RBX+16] ;
  2162. MOVAPD XMM3, [RBX+32] ;
  2163. MOVAPD XMM4, [RCX] ;
  2164. MOVAPD XMM5, [RCX+16] ;
  2165. MOVAPD XMM6, [RCX+32] ;
  2166. MULPD XMM1, XMM4 ;
  2167. ADDPD XMM0, XMM1 ;
  2168. MULPD XMM2, XMM5 ;
  2169. ADDPD XMM0, XMM2 ;
  2170. MULPD XMM3, XMM6 ;
  2171. ADDPD XMM0, XMM3 ;
  2172. ADD RBX, 48 ;
  2173. ADD RCX, 48 ;
  2174. SUB RAX, 6 ;
  2175. JMP aligned6 ;
  2176. ; LOOP FOR 2 pieces aligned
  2177. aligned2:
  2178. CMP RAX, 2 ;
2179. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  2180. MOVAPD XMM1, [RBX] ;
  2181. MOVAPD XMM2, [RCX] ;
  2182. MULPD XMM1, XMM2 ;
  2183. ADDPD XMM0, XMM1 ;
  2184. ADD RBX, 16 ;
  2185. ADD RCX, 16 ;
  2186. SUB RAX, 2 ;
  2187. JMP aligned2 ;
  2188. unaligned:
  2189. unaligned6:
  2190. CMP RAX, 6 ;
2191. JL unaligned2 ; len < 6 -> EXIT TO unaligned2
  2192. MOVUPD XMM1, [RBX] ;
  2193. MOVUPD XMM2, [RBX+16] ;
  2194. MOVUPD XMM3, [RBX+32] ;
  2195. MOVUPD XMM4, [RCX] ;
  2196. MOVUPD XMM5, [RCX+16] ;
  2197. MOVUPD XMM6, [RCX+32] ;
  2198. MULPD XMM1, XMM4 ;
  2199. ADDPD XMM0, XMM1 ;
  2200. MULPD XMM2, XMM5 ;
  2201. ADDPD XMM0, XMM2 ;
  2202. MULPD XMM3, XMM6 ;
  2203. ADDPD XMM0, XMM3 ;
  2204. ADD RBX, 48 ;
  2205. ADD RCX, 48 ;
  2206. SUB RAX, 6 ;
  2207. JMP unaligned6 ;
  2208. ; LOOP FOR 2 pieces aligned
  2209. unaligned2:
  2210. CMP RAX, 2 ;
2211. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  2212. MOVUPD XMM1, [RBX] ;
  2213. MOVUPD XMM2, [RCX] ;
  2214. MULPD XMM1, XMM2 ;
  2215. ADDPD XMM0, XMM1 ;
  2216. ADD RBX, 16 ;
  2217. ADD RCX, 16 ;
  2218. SUB RAX, 2 ;
  2219. JMP unaligned2 ;
  2220. horizontaladd: ;
  2221. MOVAPD XMM1, XMM0 ;
  2222. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2223. ADDPD XMM0, XMM1 ;
  2224. JMP singlepieces ;
  2225. single:
  2226. singlepieces: ;
  2227. CMP RAX, 0 ;
  2228. JLE store ; len <= 0- > EXIT
  2229. MOVSD XMM1, [RBX]
  2230. MOVSD XMM2, [RCX]
  2231. MULSD XMM1, XMM2
  2232. ADDSD XMM0, XMM1
  2233. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2234. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  2235. DEC RAX ; DEC(len)
  2236. JMP singlepieces ;
  2237. store:
  2238. MOVSD [RDX], XMM0 ;
  2239. endL:
  2240. END SPAXAXLoopSSE;
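(* The horizontaladd step above folds the two packed partial sums: with XMM0 = (s0, s1), the shuffled copy is
(s1, s0), and ADDPD yields (s0 + s1, s1 + s0), so the low half already holds the complete dot product before the
scalar remainder and the final MOVSD store. *)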
  2241. (* sse version of scalar product *)
  2242. PROCEDURE SPARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  2243. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2244. ; register initialization
2245. MOV RAX, [RBP+len] ; RAX reserved FOR length
2246. CMP RAX, 0 ;
2247. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
2248. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
2249. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
2250. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
2251. XORPS XMM0, XMM0 ;
2252. MOVSS XMM0, [RDX] ; destination -> low component OF XMM0
2253. CMP [RBP+linc], 4 ; check left FOR contiguity
2254. JNE single ; not contiguous -> simplest method
2255. CMP [RBP+rinc], 4 ; check right FOR contiguity
2256. JNE single ; not contiguous -> simplest method
  2257. ; check FOR alignment
  2258. MOV RSI, RBX ;
  2259. AND RSI, 3 ; ladr MOD 4
  2260. CMP RSI, 0 ; RCX = 0- > 32 Bit alignment
  2261. JNE unaligned ; not 32 bit aligned
  2262. MOV RSI, RCX ;
  2263. AND RSI, 3 ; radr MOD 4
  2264. CMP RSI, 0 ; = 0- > 32 Bit alignment
  2265. JNE unaligned ; not 32 bit aligned
  2266. MOV RSI, RBX ;
  2267. AND RSI, 8+4 ; 16 byte alignment
  2268. MOV RDI, RCX ;
  2269. AND RDI, 8+4 ; 16 byte alignment
  2270. CMP RSI, RDI ;
2271. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
2272. CMP RSI, 0 ;
2273. JE aligned ; already aligned
2274. align:
2275. ; process single elements UNTIL 128 bit alignment IS achieved
  2276. MOVSS XMM1, [RBX] ;
  2277. MOVSS XMM2, [RCX] ;
  2278. MULSS XMM1, XMM2 ;
  2279. ADDSS XMM0, XMM1 ;
  2280. ADD RBX, 4 ;
  2281. ADD RCX, 4 ;
  2282. DEC RAX ; one element has been processed ;
  2283. CMP RAX, 0 ; all elements already processed?
  2284. JLE single ;
  2285. MOV RSI, RBX ;
  2286. AND RSI, 8+4 ;
  2287. CMP RSI, 0 ;
  2288. JNE align ;
  2289. aligned:
  2290. aligned12:
  2291. CMP RAX, 12 ;
2292. JL aligned4 ; len < 12 -> EXIT TO aligned4
  2293. MOVAPS XMM1, [RBX] ;
  2294. MOVAPS XMM2, [RBX+16] ;
  2295. MOVAPS XMM3, [RBX+32] ;
  2296. MOVAPS XMM4, [RCX] ;
  2297. MOVAPS XMM5, [RCX+16] ;
  2298. MOVAPS XMM6, [RCX+32] ;
  2299. MULPS XMM1, XMM4 ;
  2300. ADDPS XMM0, XMM1 ;
  2301. MULPS XMM2, XMM5 ;
  2302. ADDPS XMM0, XMM2 ;
  2303. MULPS XMM3, XMM6 ;
  2304. ADDPS XMM0, XMM3 ;
  2305. ADD RBX, 48 ;
  2306. ADD RCX, 48 ;
  2307. SUB RAX, 12 ;
  2308. JMP aligned12 ;
  2309. ; LOOP FOR 2 pieces aligned
  2310. aligned4:
  2311. CMP RAX, 4 ;
  2312. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  2313. MOVAPS XMM1, [RBX] ;
  2314. MOVAPS XMM2, [RCX] ;
  2315. MULPS XMM1, XMM2 ;
  2316. ADDPS XMM0, XMM1 ;
  2317. ADD RBX, 16 ;
  2318. ADD RCX, 16 ;
  2319. SUB RAX, 4 ;
  2320. JMP aligned4 ;
  2321. unaligned:
  2322. unaligned12:
  2323. CMP RAX, 12 ;
2324. JL unaligned4 ; len < 12 -> EXIT TO unaligned4
  2325. MOVUPS XMM1, [RBX] ;
  2326. MOVUPS XMM2, [RBX+16] ;
  2327. MOVUPS XMM3, [RBX+32] ;
  2328. MOVUPS XMM4, [RCX] ;
  2329. MOVUPS XMM5, [RCX+16] ;
  2330. MOVUPS XMM6, [RCX+32] ;
  2331. MULPS XMM1, XMM4 ;
  2332. ADDPS XMM0, XMM1 ;
  2333. MULPS XMM2, XMM5 ;
  2334. ADDPS XMM0, XMM2 ;
  2335. MULPS XMM3, XMM6 ;
  2336. ADDPS XMM0, XMM3 ;
  2337. ADD RBX, 48 ;
  2338. ADD RCX, 48 ;
  2339. SUB RAX, 12 ;
  2340. JMP unaligned12 ;
  2341. ; LOOP FOR 2 pieces aligned
  2342. unaligned4:
  2343. CMP RAX, 4 ;
  2344. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  2345. MOVUPS XMM1, [RBX] ;
  2346. MOVUPS XMM2, [RCX] ;
  2347. MULPS XMM1, XMM2 ;
  2348. ADDPS XMM0, XMM1 ;
  2349. ADD RBX, 16 ;
  2350. ADD RCX, 16 ;
  2351. SUB RAX, 4 ;
  2352. JMP unaligned4 ;
  2353. horizontaladd: ;
  2354. MOVAPS XMM1, XMM0 ;
  2355. ; 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *)
  2356. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  2357. ADDPS XMM1, XMM0 ;
  2358. MOVAPS XMM0, XMM1
  2359. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  2360. ADDPS XMM0, XMM1 ;
  2361. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  2362. JMP singlepieces ;
  2363. single:
  2364. singlepieces: ;
  2365. CMP RAX, 0 ;
  2366. JLE store ; len <= 0- > EXIT
  2367. MOVSS XMM1, [RBX]
  2368. MOVSS XMM2, [RCX]
  2369. MULSS XMM1, XMM2
  2370. ADDSS XMM0, XMM1
  2371. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2372. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  2373. DEC RAX ; DEC(len)
  2374. JMP singlepieces ;
  2375. store:
  2376. MOVSS [RDX], XMM0 ;
  2377. endL:
  2378. END SPARARLoopSSE;
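(* Worked example of the reduction at horizontaladd above, with XMM0 = (s0, s1, s2, s3), component 0 lowest:
MOVAPS XMM1, XMM0 ; SHUFPS XMM1, XMM1, 44H -> XMM1 = (s0, s1, s0, s1)
ADDPS XMM1, XMM0 -> XMM1 = (2*s0, 2*s1, s0+s2, s1+s3)
MOVAPS XMM0, XMM1 ; SHUFPS XMM0, XMM0, 30H -> XMM0 = (2*s0, 2*s0, s1+s3, 2*s0)
ADDPS XMM0, XMM1 -> component 2 = s0+s1+s2+s3
SHUFPS XMM0, XMM0, 2 -> the total ends up in component 0, where the remaining scalar products and the
final MOVSS store expect it. *)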
  2379. PROCEDURE MulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2380. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2381. MOV RAX, [RBP+len] ; eax := len
  2382. MOV RBX, [RBP+ladr] ; ebx := ladr
  2383. MOV RCX, [RBP+radr] ; ecx := radr
  2384. MOV RDX, [RBP+dadr] ; edx := dadr
  2385. start:
  2386. CMP RAX, 0 ; WHILE len > 0 DO
  2387. JLE endL
  2388. FLD QWORD [RBX] ; S.GET(ladr, x)
  2389. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2390. FLD QWORD [RCX] ; S.GET(ladr, y)
  2391. FMULP ; x := x*y
  2392. FSTP QWORD [RDX]
  2393. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2394. DEC RAX ; DEC(len)
  2395. JMP start ;
  2396. endL:
  2397. FWAIT ;
  2398. END MulAXSXLoopA;
  2399. PROCEDURE MulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2400. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2401. MOV RAX, [RBP+len] ; eax := len
  2402. MOV RBX, [RBP+ladr] ; ebx := ladr
  2403. MOV RCX, [RBP+radr] ; ecx := radr
  2404. MOV RDX, [RBP+dadr] ; edx := dadr
  2405. start:
  2406. CMP RAX, 0 ; WHILE len > 0 DO
  2407. JLE endL
  2408. FLD DWORD [RBX] ; S.GET(ladr, x)
  2409. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2410. FLD DWORD [RCX] ; S.GET(ladr, y)
  2411. FMULP ; x := x*y
  2412. FSTP DWORD [RDX]
  2413. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2414. DEC RAX ; DEC(len)
  2415. JMP start ;
  2416. endL:
  2417. FWAIT ;
  2418. END MulARSRLoopA;
  2419. PROCEDURE IncMulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2420. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2421. MOV RAX, [RBP+len] ; eax := len
  2422. MOV RBX, [RBP+ladr] ; ebx := ladr
  2423. MOV RCX, [RBP+radr] ; ecx := radr
  2424. MOV RDX, [RBP+dadr] ; edx := dadr
  2425. start:
  2426. CMP RAX, 0 ; WHILE len > 0 DO
  2427. JLE endL
  2428. FLD QWORD [RBX] ; S.GET(ladr, x)
  2429. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2430. FLD QWORD [RCX] ; S.GET(ladr, y)
  2431. FMULP ; x := x*y
2432. FLD QWORD [RDX] ; current destination value
  2433. FADDP ;
  2434. FSTP QWORD [RDX]
  2435. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2436. DEC RAX ; DEC(len)
  2437. JMP start ;
  2438. endL:
  2439. FWAIT ;
  2440. END IncMulAXSXLoopA;
  2441. PROCEDURE IncMulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2442. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2443. MOV RAX, [RBP+len] ; eax := len
  2444. MOV RBX, [RBP+ladr] ; ebx := ladr
  2445. MOV RCX, [RBP+radr] ; ecx := radr
  2446. MOV RDX, [RBP+dadr] ; edx := dadr
  2447. start:
  2448. CMP RAX, 0 ; WHILE len > 0 DO
  2449. JLE endL
  2450. FLD DWORD [RBX] ; S.GET(ladr, x)
  2451. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2452. FLD DWORD [RCX] ; S.GET(ladr, y)
  2453. FMULP ; x := x*y
2454. FLD DWORD [RDX] ; current destination value
  2455. FADDP ;
  2456. FSTP DWORD [RDX]
  2457. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2458. DEC RAX ; DEC(len)
  2459. JMP start ;
  2460. endL:
  2461. FWAIT ;
  2462. END IncMulARSRLoopA;
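(* Reference semantics of the IncMul loops (documentation only): multiply by the scalar at radr and accumulate
into the destination, as in the SSE variant below.
PROCEDURE IncMulAXSXSketch( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
VAR x, y, d: LONGREAL;
BEGIN
SYSTEM.GET( radr, y );
WHILE len > 0 DO
SYSTEM.GET( ladr, x ); SYSTEM.GET( dadr, d ); SYSTEM.PUT( dadr, d + x * y );
INC( ladr, linc ); INC( dadr, dinc ); DEC( len )
END
END IncMulAXSXSketch; *)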
  2463. PROCEDURE MulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2464. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2465. (*
  2466. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2467. 2.) process starting unaligned data ( using single instructions)
  2468. 3.) process aligned data
  2469. 4.) process remaining unaligned data (using single instructions)
  2470. *)
  2471. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2472. ; register initialization
2473. MOV RAX, [RBP+len] ; RAX reserved FOR length
2474. CMP RAX, 0 ;
2475. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
2476. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
2477. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
2478. MOV RCX, [RBP+radr] ;
2479. MOVSD XMM0, [RCX] ;
2480. SHUFPD XMM0, XMM0, 0 ; high half := low half (broadcast the scalar)
2481. ; check IF data are contiguous IN memory
2482. CMP [RBP+linc], 8 ; check left FOR contiguity
2483. JNE single ; not contiguous -> simplest method
2484. CMP [RBP+dinc], 8 ; check destination FOR contiguity
2485. JNE single ; not contiguous -> simplest method
  2486. ; check FOR alignment
  2487. MOV RCX, RBX ;
  2488. AND RCX, 7 ; ladr MOD 8
  2489. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2490. JNE unaligned ; not 64 bit aligned
  2491. MOV RCX, RDX ;
  2492. AND RCX, 7 ; dadr MOD 8
  2493. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2494. JNE unaligned ; not 64 bit aligned
  2495. MOV RSI, RBX ;
  2496. AND RSI, 8 ; 16 byte alignment
  2497. MOV RDI, RDX ;
  2498. AND RDI, 8 ; 16 byte alignment
  2499. CMP RSI, RDI ;
  2500. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2501. CMP RSI, 8 ;
  2502. JNE aligned ; ladr and dadr already 128 bit aligned
2503. ; process one single element TO achieve 128 bit alignment
  2504. MOVSD XMM1, [RBX] ;
  2505. MULSD XMM1, XMM0 ;
  2506. MOVSD [RDX], XMM1 ;
  2507. ADD RBX, 8 ; now RBX IS 16 byte aligned
  2508. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  2509. DEC RAX ; one element has been processed
  2510. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2511. aligned:
  2512. aligned8:
  2513. CMP RAX, 8 ;
2514. JL aligned2 ; len < 8 -> EXIT TO aligned2
  2515. MOVAPD XMM1, [RBX] ;
  2516. MOVAPD XMM2, [RBX+16] ;
  2517. MOVAPD XMM3, [RBX+32] ;
  2518. MOVAPD XMM4, [RBX+48] ;
  2519. ADD RBX, 64 ;
  2520. MULPD XMM1, XMM0 ;
  2521. MULPD XMM2, XMM0 ;
  2522. MULPD XMM3, XMM0 ;
  2523. MULPD XMM4, XMM0 ;
  2524. MOVAPD [RDX], XMM1 ;
  2525. MOVAPD [RDX+16], XMM2 ;
  2526. MOVAPD [RDX+32], XMM3 ;
  2527. MOVAPD [RDX+48], XMM4 ;
  2528. ADD RDX, 64 ;
  2529. SUB RAX, 8 ;
  2530. JMP aligned8 ;
  2531. ; LOOP FOR 2 pieces aligned
  2532. aligned2: ;
  2533. CMP RAX, 2 ;
  2534. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2535. MOVAPD XMM1, [RBX] ;
  2536. ADD RBX, 16 ;
  2537. MULPD XMM1, XMM0 ;
  2538. MOVAPD [RDX], XMM1 ;
  2539. ADD RDX, 16 ;
  2540. SUB RAX, 2 ;
  2541. JMP aligned2 ;
  2542. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2543. unaligned: ;
  2544. unaligned8: ;
  2545. CMP RAX, 8 ;
2546. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  2547. MOVUPD XMM1, [RBX] ;
  2548. MOVUPD XMM2, [RBX+16] ;
  2549. MOVUPD XMM3, [RBX+32] ;
  2550. MOVUPD XMM4, [RBX+48] ;
  2551. ADD RBX, 64
  2552. MULPD XMM1, XMM0 ;
  2553. MULPD XMM2, XMM0 ;
  2554. MULPD XMM3, XMM0 ;
  2555. MULPD XMM4, XMM0 ;
  2556. MOVUPD [RDX], XMM1 ;
  2557. MOVUPD [RDX+16], XMM2 ;
  2558. MOVUPD [RDX+32], XMM3 ;
  2559. MOVUPD [RDX+48], XMM4 ;
  2560. ADD RDX, 64 ;
  2561. SUB RAX, 8 ;
  2562. JMP unaligned8 ;
  2563. ; LOOP FOR 2 pieces unaligned
  2564. unaligned2: ;
  2565. CMP RAX, 2 ;
  2566. JL singlepieces ; len < 2- > EXIT
  2567. MOVUPD XMM1, [RBX] ;
  2568. ADD RBX, 16 ;
  2569. MULPD XMM1, XMM0 ;
  2570. MOVUPD [RDX], XMM1 ;
  2571. ADD RDX, 16 ;
  2572. SUB RAX, 2 ;
  2573. JMP unaligned2 ;
  2574. ; one piece left OR non-contiguous data
  2575. single:
  2576. singlepieces: ;
  2577. CMP RAX, 0 ;
  2578. JLE endL ; len <= 0- > EXIT
  2579. MOVSD XMM1, [RBX]
  2580. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2581. MULSD XMM1, XMM0
  2582. MOVSD [RDX], XMM1
  2583. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2584. DEC RAX ; DEC(len)
  2585. JMP singlepieces ;
  2586. endL:
  2587. END MulAXSXLoopSSE;
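(* Scalar reference for the MulAXSX / MulARSR loops (documentation only): dest := left * scalar, the scalar being
loaded once from radr and broadcast with SHUFPD respectively SHUFPS ..., 0 in the SSE code.
PROCEDURE MulAXSXSketch( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
VAR x, y: LONGREAL;
BEGIN
SYSTEM.GET( radr, y );
WHILE len > 0 DO
SYSTEM.GET( ladr, x ); SYSTEM.PUT( dadr, x * y );
INC( ladr, linc ); INC( dadr, dinc ); DEC( len )
END
END MulAXSXSketch; *)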
  2588. PROCEDURE MulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2589. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2590. (*
  2591. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2592. 2.) process starting unaligned data ( using single instructions)
  2593. 3.) process aligned data
  2594. 4.) process remaining unaligned data (using single instructions)
  2595. *)
  2596. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2597. ; register initialization
2598. MOV RAX, [RBP+len] ; RAX reserved FOR length
2599. CMP RAX, 0 ;
2600. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
2601. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
2602. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
2603. MOV RCX, [RBP+radr] ;
2604. MOVSS XMM0, [RCX] ;
2605. SHUFPS XMM0, XMM0, 0 ; all components now carry the same value (broadcast the scalar)
2606. ; check IF data are contiguous IN memory
2607. CMP [RBP+linc], 4 ; check left FOR contiguity
2608. JNE single ; not contiguous -> simplest method
2609. CMP [RBP+dinc], 4 ; check destination FOR contiguity
2610. JNE single ; not contiguous -> simplest method
  2611. ; check FOR alignment
  2612. MOV RCX, RBX ;
  2613. AND RCX, 3 ; ladr MOD 4
  2614. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
  2615. JNE unaligned ; not 32 bit aligned
  2616. MOV RCX, RDX ;
  2617. AND RCX, 3 ; dadr MOD 4
  2618. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
2619. JNE unaligned ; not 32 bit aligned
  2620. MOV RSI, RBX ;
  2621. AND RSI, 8+4 ; 16 byte alignment
  2622. MOV RDI, RDX ;
  2623. AND RDI, 8+4 ; 16 byte alignment
  2624. CMP RSI, RDI ;
  2625. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2626. CMP RSI, 0 ;
  2627. JE aligned ; already aligned
  2628. align:
2629. ; process single elements UNTIL 128 bit alignment IS achieved
  2630. MOVSS XMM1, [RBX] ;
  2631. MULSS XMM1, XMM0 ;
  2632. MOVSS [RDX], XMM1 ;
  2633. ADD RBX, 4 ;
  2634. ADD RDX, 4 ;
  2635. DEC RAX ; one element has been processed ;
  2636. CMP RAX, 0 ; all elements already processed?
  2637. JLE single
  2638. MOV RSI, RBX ;
  2639. AND RSI, 8+4 ;
  2640. CMP RSI, 0 ;
  2641. JNE align ;
  2642. aligned:
  2643. aligned16:
  2644. CMP RAX, 16 ;
2645. JL aligned4 ; len < 16 -> EXIT TO aligned4
  2646. MOVAPS XMM1, [RBX] ;
  2647. MOVAPS XMM2, [RBX+16] ;
  2648. MOVAPS XMM3, [RBX+32] ;
  2649. MOVAPS XMM4, [RBX+48] ;
  2650. ADD RBX, 64 ;
  2651. MULPS XMM1, XMM0 ;
  2652. MULPS XMM2, XMM0 ;
  2653. MULPS XMM3, XMM0 ;
  2654. MULPS XMM4, XMM0 ;
  2655. MOVAPS [RDX], XMM1 ;
  2656. MOVAPS [RDX+16], XMM2 ;
  2657. MOVAPS [RDX+32], XMM3 ;
  2658. MOVAPS [RDX+48], XMM4 ;
  2659. ADD RDX, 64 ;
  2660. SUB RAX, 16 ;
  2661. JMP aligned16 ;
  2662. ; LOOP FOR 2 pieces aligned
  2663. aligned4: ;
  2664. CMP RAX, 4 ;
  2665. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2666. MOVAPS XMM1, [RBX] ;
  2667. ADD RBX, 16 ;
  2668. MULPS XMM1, XMM0 ;
  2669. MOVAPS [RDX], XMM1 ;
  2670. ADD RDX, 16 ;
  2671. SUB RAX, 4 ;
  2672. JMP aligned4 ;
  2673. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2674. unaligned: ;
  2675. unaligned16: ;
  2676. CMP RAX, 16 ;
2677. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  2678. MOVUPS XMM1, [RBX] ;
  2679. MOVUPS XMM2, [RBX+16] ;
  2680. MOVUPS XMM3, [RBX+32] ;
  2681. MOVUPS XMM4, [RBX+48] ;
  2682. ADD RBX, 64
  2683. MULPS XMM1, XMM0 ;
  2684. MULPS XMM2, XMM0 ;
  2685. MULPS XMM3, XMM0 ;
  2686. MULPS XMM4, XMM0 ;
  2687. MOVUPS [RDX], XMM1 ;
  2688. MOVUPS [RDX+16], XMM2 ;
  2689. MOVUPS [RDX+32], XMM3 ;
  2690. MOVUPS [RDX+48], XMM4 ;
  2691. ADD RDX, 64 ;
  2692. SUB RAX, 16 ;
  2693. JMP unaligned16 ;
  2694. ; LOOP FOR 2 pieces unaligned
  2695. unaligned4: ;
  2696. CMP RAX, 4 ;
  2697. JL singlepieces ; len < 2- > EXIT
  2698. MOVUPS XMM1, [RBX] ;
  2699. ADD RBX, 16 ;
  2700. MULPS XMM1, XMM0 ;
  2701. MOVUPS [RDX], XMM1 ;
  2702. ADD RDX, 16 ;
  2703. SUB RAX, 4 ;
  2704. JMP unaligned4 ;
  2705. ; one piece left OR non-contiguous data
  2706. single:
  2707. singlepieces: ;
  2708. CMP RAX, 0 ;
  2709. JLE endL ; len <= 0- > EXIT
  2710. MOVSS XMM1, [RBX]
  2711. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2712. MULSS XMM1, XMM0
  2713. MOVSS [RDX], XMM1
  2714. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2715. DEC RAX ; DEC(len)
  2716. JMP singlepieces ;
  2717. endL:
  2718. END MulARSRLoopSSE;
  2719. PROCEDURE IncMulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2720. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2721. (*
  2722. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2723. 2.) process starting unaligned data ( using single instructions)
  2724. 3.) process aligned data
  2725. 4.) process remaining unaligned data (using single instructions)
  2726. *)
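(* Note (illustrative): with 8-byte LONGREAL elements that are already 8-byte aligned, the number of
scalar prologue elements needed to reach 16-byte alignment is ((-ladr) MOD 16) DIV 8, i.e. 0 or 1;
this is why the code below handles at most one single element before entering the aligned loop. *)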
  2727. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2728. ; register initialization
2729. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2730. CMP RAX, 0 ;
  2731. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  2732. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2733. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2734. MOV RCX, [RBP+radr] ;
  2735. MOVSD XMM0, [RCX] ;
  2736. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2737. ; check IF data are contiguous IN memory
2738. CMP [RBP+linc], 8 ; check left FOR continuity
  2739. JNE single ; not continuous- > simplest method
  2740. CMP [RBP+dinc], 8 ; check dest FOR continuity
  2741. JNE single ; not continuous- > simplest method
  2742. ; check FOR alignment
  2743. MOV RCX, RBX ;
  2744. AND RCX, 7 ; ladr MOD 8
  2745. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2746. JNE unaligned ; not 64 bit aligned
  2747. MOV RCX, RDX ;
  2748. AND RCX, 7 ; dadr MOD 8
  2749. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2750. JNE unaligned ; not 64 bit aligned
  2751. MOV RSI, RBX ;
  2752. AND RSI, 8 ; 16 byte alignment
  2753. MOV RDI, RDX ;
  2754. AND RDI, 8 ; 16 byte alignment
  2755. CMP RSI, RDI ;
  2756. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2757. CMP RSI, 8 ;
  2758. JNE aligned ; ladr and dadr already 128 bit aligned
2759. ; process one single element TO achieve 128 bit alignment
  2760. MOVSD XMM1, [RBX] ;
  2761. MULSD XMM1, XMM0 ;
  2762. MOVSD XMM2, [RDX] ;
  2763. ADDSD XMM1, XMM2 ;
  2764. MOVSD [RDX], XMM1 ;
  2765. ADD RBX, 8 ; now RBX IS 16 byte aligned
  2766. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  2767. DEC RAX ; one element has been processed
  2768. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2769. aligned:
  2770. aligned8:
  2771. CMP RAX, 8 ;
2772. JL aligned2 ; len < 8 -> EXIT TO aligned2
  2773. MOVAPD XMM1, [RBX] ;
  2774. MOVAPD XMM2, [RBX+16] ;
  2775. MOVAPD XMM3, [RBX+32] ;
  2776. MOVAPD XMM4, [RBX+48] ;
  2777. ADD RBX, 64 ;
  2778. MULPD XMM1, XMM0 ;
  2779. MULPD XMM2, XMM0 ;
  2780. MULPD XMM3, XMM0 ;
  2781. MULPD XMM4, XMM0 ;
  2782. MOVAPD XMM5, [RDX] ;
  2783. ADDPD XMM1, XMM5
  2784. MOVAPD [RDX], XMM1 ;
  2785. MOVAPD XMM6, [RDX+16] ;
  2786. ADDPD XMM2, XMM6
  2787. MOVAPD [RDX+16], XMM2 ;
  2788. MOVAPD XMM7, [RDX+32] ;
  2789. ADDPD XMM3, XMM7
  2790. MOVAPD [RDX+32], XMM3 ;
  2791. MOVAPD XMM5, [RDX+48] ;
  2792. ADDPD XMM4, XMM5
  2793. MOVAPD [RDX+48], XMM4 ;
  2794. ADD RDX, 64 ;
  2795. SUB RAX, 8 ;
  2796. JMP aligned8 ;
  2797. ; LOOP FOR 2 pieces aligned
  2798. aligned2: ;
  2799. CMP RAX, 2 ;
  2800. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2801. MOVAPD XMM1, [RBX] ;
  2802. ADD RBX, 16 ;
  2803. MULPD XMM1, XMM0 ;
  2804. MOVAPD XMM2, [RDX] ;
  2805. ADDPD XMM1, XMM2
  2806. MOVAPD [RDX], XMM1 ;
  2807. ADD RDX, 16 ;
  2808. SUB RAX, 2 ;
  2809. JMP aligned2 ;
  2810. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2811. unaligned: ;
  2812. unaligned8: ;
  2813. CMP RAX, 8 ;
2814. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  2815. MOVUPD XMM1, [RBX] ;
  2816. MOVUPD XMM2, [RBX+16] ;
  2817. MOVUPD XMM3, [RBX+32] ;
  2818. MOVUPD XMM4, [RBX+48] ;
  2819. ADD RBX, 64
  2820. MULPD XMM1, XMM0 ;
  2821. MULPD XMM2, XMM0 ;
  2822. MULPD XMM3, XMM0 ;
  2823. MULPD XMM4, XMM0 ;
  2824. MOVUPD XMM5, [RDX] ;
  2825. ADDPD XMM1, XMM5
  2826. MOVUPD [RDX], XMM1 ;
  2827. MOVUPD XMM6, [RDX+16] ;
  2828. ADDPD XMM2, XMM6
  2829. MOVUPD [RDX+16], XMM2 ;
  2830. MOVUPD XMM7, [RDX+32] ;
  2831. ADDPD XMM3, XMM7
  2832. MOVUPD [RDX+32], XMM3 ;
  2833. MOVUPD XMM5, [RDX+48] ;
  2834. ADDPD XMM4, XMM5
  2835. MOVUPD [RDX+48], XMM4 ;
  2836. ADD RDX, 64 ;
  2837. SUB RAX, 8 ;
  2838. JMP unaligned8 ;
  2839. ; LOOP FOR 2 pieces unaligned
  2840. unaligned2: ;
  2841. CMP RAX, 2 ;
  2842. JL singlepieces ; len < 2- > EXIT
  2843. MOVUPD XMM1, [RBX] ;
  2844. ADD RBX, 16 ;
  2845. MULPD XMM1, XMM0 ;
  2846. MOVUPD XMM2, [RDX] ;
  2847. ADDPD XMM1, XMM2
  2848. MOVUPD [RDX], XMM1 ;
  2849. ADD RDX, 16 ;
  2850. SUB RAX, 2 ;
  2851. JMP unaligned2 ;
  2852. ; one piece left OR non-contiguous data
  2853. single:
  2854. singlepieces: ;
  2855. CMP RAX, 0 ;
  2856. JLE endL ; len <= 0- > EXIT
  2857. MOVSD XMM1, [RBX]
2858. ADD RBX, [RBP+linc] ; INC(ladr, linc)
2859. MULSD XMM1, XMM0
2860. MOVSD XMM2, [RDX] ;
2861. ADDSD XMM1, XMM2
2862. MOVSD [RDX], XMM1
2863. ADD RDX, [RBP+dinc] ; INC(dadr, dinc)
  2864. DEC RAX ; DEC(len)
  2865. JMP singlepieces ;
  2866. endL:
  2867. END IncMulAXSXLoopSSE;
  2868. PROCEDURE IncMulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2869. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2870. (*
  2871. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2872. 2.) process starting unaligned data ( using single instructions)
  2873. 3.) process aligned data
  2874. 4.) process remaining unaligned data (using single instructions)
  2875. *)
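(* Illustrative sketch (assumption: contiguous data, linc = dinc = 4): the overall structure of the
SSE code below in plain Oberon; with 4-byte REAL elements the scalar prologue runs ((-ladr) MOD 16) DIV 4
times, i.e. 0..3 elements, until 16-byte alignment is reached. Procedure and variable names are
illustrative only. *)
(*
PROCEDURE IncMulARSRLoopRef( ladr, radr, dadr: ADDRESS; len: SIZE );
VAR s, x, d: REAL;
BEGIN
	SYSTEM.GET( radr, s );
	WHILE (len > 0) & (ladr MOD 16 # 0) DO  (* scalar prologue up to 16-byte alignment *)
		SYSTEM.GET( ladr, x );  SYSTEM.GET( dadr, d );
		SYSTEM.PUT( dadr, d + s * x );
		INC( ladr, 4 );  INC( dadr, 4 );  DEC( len );
	END;
	WHILE len > 0 DO  (* the SSE code performs this update four or sixteen elements at a time *)
		SYSTEM.GET( ladr, x );  SYSTEM.GET( dadr, d );
		SYSTEM.PUT( dadr, d + s * x );
		INC( ladr, 4 );  INC( dadr, 4 );  DEC( len );
	END;
END IncMulARSRLoopRef;
*)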
  2876. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2877. ; register initialization
2878. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2879. CMP RAX, 0 ;
  2880. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  2881. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2882. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2883. MOV RCX, [RBP+radr] ;
  2884. MOVSS XMM0, [RCX] ;
2885. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  2886. ; check IF data are contiguous IN memory
2887. CMP [RBP+linc], 4 ; check left FOR continuity
  2888. JNE single ; not continuous- > simplest method
  2889. CMP [RBP+dinc], 4 ; check dest FOR continuity
  2890. JNE single ; not continuous- > simplest method
  2891. ; check FOR alignment
  2892. MOV RCX, RBX ;
  2893. AND RCX, 3 ; ladr MOD 4
  2894. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
  2895. JNE unaligned ; not 32 bit aligned
  2896. MOV RCX, RDX ;
  2897. AND RCX, 3 ; dadr MOD 4
  2898. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
2899. JNE unaligned ; not 32 bit aligned
  2900. MOV RSI, RBX ;
  2901. AND RSI, 8+4 ; 16 byte alignment
  2902. MOV RDI, RDX ;
  2903. AND RDI, 8+4 ; 16 byte alignment
  2904. CMP RSI, RDI ;
  2905. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2906. CMP RSI, 0 ;
  2907. JE aligned ; already aligned
  2908. align:
2909. ; process single elements UNTIL 128 bit alignment is achieved
  2910. MOVSS XMM1, [RBX] ;
  2911. MULSS XMM1, XMM0 ;
  2912. MOVSS XMM2, [RDX] ;
  2913. ADDSS XMM1, XMM2 ;
  2914. MOVSS [RDX], XMM1 ;
  2915. ADD RBX, 4 ;
  2916. ADD RDX, 4 ;
  2917. DEC RAX ; one element has been processed ;
  2918. CMP RAX, 0 ; all elements already processed?
  2919. JLE single
  2920. MOV RSI, RBX ;
  2921. AND RSI, 8+4 ;
  2922. CMP RSI, 0 ;
  2923. JNE align ;
  2924. aligned:
  2925. aligned16:
  2926. CMP RAX, 16 ;
2927. JL aligned4 ; len < 16 -> EXIT TO aligned4
  2928. MOVAPS XMM1, [RBX] ;
  2929. MOVAPS XMM2, [RBX+16] ;
  2930. MOVAPS XMM3, [RBX+32] ;
  2931. MOVAPS XMM4, [RBX+48] ;
  2932. ADD RBX, 64 ;
  2933. MULPS XMM1, XMM0 ;
  2934. MULPS XMM2, XMM0 ;
  2935. MULPS XMM3, XMM0 ;
  2936. MULPS XMM4, XMM0 ;
  2937. MOVAPS XMM5, [RDX] ;
  2938. ADDPS XMM1, XMM5 ;
  2939. MOVAPS [RDX], XMM1 ;
  2940. MOVAPS XMM6, [RDX+16] ;
  2941. ADDPS XMM2, XMM6 ;
  2942. MOVAPS [RDX+16], XMM2 ;
  2943. MOVAPS XMM7, [RDX+32] ;
  2944. ADDPS XMM3, XMM7 ;
  2945. MOVAPS [RDX+32], XMM3 ;
  2946. MOVAPS XMM5, [RDX+48] ;
  2947. ADDPS XMM4, XMM5 ;
  2948. MOVAPS [RDX+48], XMM4 ;
  2949. ADD RDX, 64 ;
  2950. SUB RAX, 16 ;
  2951. JMP aligned16 ;
2952. ; LOOP FOR 4 pieces aligned
  2953. aligned4: ;
  2954. CMP RAX, 4 ;
2955. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2956. MOVAPS XMM1, [RBX] ;
  2957. ADD RBX, 16 ;
  2958. MULPS XMM1, XMM0 ;
  2959. MOVAPS XMM2, [RDX] ;
  2960. ADDPS XMM1, XMM2 ;
  2961. MOVAPS [RDX], XMM1 ;
  2962. ADD RDX, 16 ;
  2963. SUB RAX, 4 ;
  2964. JMP aligned4 ;
  2965. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2966. unaligned: ;
  2967. unaligned16: ;
  2968. CMP RAX, 16 ;
2969. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  2970. MOVUPS XMM1, [RBX] ;
  2971. MOVUPS XMM2, [RBX+16] ;
  2972. MOVUPS XMM3, [RBX+32] ;
  2973. MOVUPS XMM4, [RBX+48] ;
  2974. ADD RBX, 64
  2975. MULPS XMM1, XMM0 ;
  2976. MULPS XMM2, XMM0 ;
  2977. MULPS XMM3, XMM0 ;
  2978. MULPS XMM4, XMM0 ;
  2979. MOVUPS XMM5, [RDX] ;
  2980. ADDPS XMM1, XMM5 ;
  2981. MOVUPS [RDX], XMM1 ;
  2982. MOVUPS XMM6, [RDX+16] ;
  2983. ADDPS XMM2, XMM6 ;
  2984. MOVUPS [RDX+16], XMM2 ;
  2985. MOVUPS XMM7, [RDX+32] ;
  2986. ADDPS XMM3, XMM7 ;
  2987. MOVUPS [RDX+32], XMM3 ;
  2988. MOVUPS XMM5, [RDX+48] ;
  2989. ADDPS XMM4, XMM5 ;
  2990. MOVUPS [RDX+48], XMM4 ;
  2991. ADD RDX, 64 ;
  2992. SUB RAX, 16 ;
  2993. JMP unaligned16 ;
2994. ; LOOP FOR 4 pieces unaligned
  2995. unaligned4: ;
  2996. CMP RAX, 4 ;
2997. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2998. MOVUPS XMM1, [RBX] ;
  2999. ADD RBX, 16 ;
  3000. MULPS XMM1, XMM0 ;
  3001. MOVUPS XMM2, [RDX] ;
  3002. ADDPS XMM1, XMM2 ;
  3003. MOVUPS [RDX], XMM1 ;
  3004. ADD RDX, 16 ;
  3005. SUB RAX, 4 ;
  3006. JMP unaligned4 ;
  3007. ; one piece left OR non-contiguous data
  3008. single:
  3009. singlepieces: ;
  3010. CMP RAX, 0 ;
  3011. JLE endL ; len <= 0- > EXIT
  3012. MOVSS XMM1, [RBX]
3013. ADD RBX, [RBP+linc] ; INC(ladr, linc)
3014. MULSS XMM1, XMM0
3015. MOVSS XMM2, [RDX] ;
3016. ADDSS XMM1, XMM2 ;
3017. MOVSS [RDX], XMM1
3018. ADD RDX, [RBP+dinc] ; INC(dadr, dinc)
  3019. DEC RAX ; DEC(len)
  3020. JMP singlepieces ;
  3021. endL:
  3022. END IncMulARSRLoopSSE;
  3023. (*
  3024. PROCEDURE AlignedSPXSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  3025. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3026. ; ; register initialization
  3027. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  3028. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
  3029. MOV RSI, [RBP+radr] ; RSI reserved for radr
3030. MOV RAX, [RBP+len] ; RAX reserved for length
  3031. MOV RCX, [RBP+stride] ; RCX reserved for stride
  3032. XORPD XMM2, XMM2 ;
  3033. XORPD XMM3, XMM3 ;
  3034. XORPD XMM4, XMM4 ;
  3035. XORPD XMM5, XMM5 ;
  3036. XORPD XMM6, XMM6 ;
  3037. XOR RDI, RDI ;
  3038. aligned4:
  3039. CMP RAX, 4 ;
  3040. JL aligned2 ; ; len < 4- > exit to singlepieces
  3041. MOV RSI, [RBP+radr] ;
  3042. ADD RSI, RDI ;
  3043. MOVAPD XMM7, [RBX] ;
  3044. MOVAPD XMM0, [RSI] ;
  3045. ADD RSI, RCX ;
  3046. MOVAPD XMM1, [RSI] ;
  3047. MULPD XMM0, XMM7 ;
  3048. ADDPD XMM2, XMM0 ;
  3049. ADD RSI, RCX ;
  3050. MOVAPD XMM0, [RSI] ;
  3051. MULPD XMM1, XMM7 ;
  3052. ADDPD XMM3, XMM1 ;
  3053. ADD RSI, RCX ;
  3054. MOVAPD XMM1, [RSI] ;
  3055. MULPD XMM0, XMM7 ;
  3056. ADDPD XMM4, XMM0 ;
  3057. ADD RSI, RCX ;
  3058. MOVAPD XMM0, [RSI] ;
  3059. MULPD XMM1, XMM7 ;
  3060. ADDPD XMM5, XMM1 ;
  3061. MULPD XMM0, XMM7 ;
  3062. ADDPD XMM6, XMM0 ;
  3063. ADD RBX, 16 ;
  3064. ADD RDI, 16 ;
  3065. MOV RSI, [RBP+radr] ;
  3066. ADD RSI, RDI ;
  3067. MOVAPD XMM7, [RBX] ;
  3068. MOVAPD XMM0, [RSI] ;
  3069. ADD RSI, RCX ;
  3070. MOVAPD XMM1, [RSI] ;
  3071. MULPD XMM0, XMM7 ;
  3072. ADDPD XMM2, XMM0 ;
  3073. ADD RSI, RCX ;
  3074. MOVAPD XMM0, [RSI] ;
  3075. MULPD XMM1, XMM7 ;
  3076. ADDPD XMM3, XMM1 ;
  3077. ADD RSI, RCX ;
  3078. MOVAPD XMM1, [RSI] ;
  3079. MULPD XMM0, XMM7 ;
  3080. ADDPD XMM4, XMM0 ;
  3081. ADD RSI, RCX ;
  3082. MOVAPD XMM0, [RSI] ;
  3083. MULPD XMM1, XMM7 ;
  3084. ADDPD XMM5, XMM1 ;
  3085. MULPD XMM0, XMM7 ;
  3086. ADDPD XMM6, XMM0 ;
  3087. ADD RBX, 16 ;
  3088. ADD RDI, 16 ;
  3089. SUB RAX, 4 ;
  3090. JMP aligned4 ;
  3091. aligned2:
  3092. CMP RAX, 2 ;
  3093. JL horizontaladd ; ; len < 4- > exit to singlepieces
  3094. MOV RSI, [RBP+radr] ;
  3095. ADD RSI, RDI ;
  3096. MOVAPD XMM7, [RBX] ;
  3097. MOVAPD XMM0, [RSI] ;
  3098. ADD RSI, RCX ;
  3099. MOVAPD XMM1, [RSI] ;
  3100. MULPD XMM0, XMM7 ;
  3101. ADDPD XMM2, XMM0 ;
  3102. ADD RSI, RCX ;
  3103. MOVAPD XMM0, [RSI] ;
  3104. MULPD XMM1, XMM7 ;
  3105. ADDPD XMM3, XMM1 ;
  3106. ADD RSI, RCX ;
  3107. MOVAPD XMM1, [RSI] ;
  3108. MULPD XMM0, XMM7 ;
  3109. ADDPD XMM4, XMM0 ;
  3110. ADD RSI, RCX ;
  3111. MOVAPD XMM0, [RSI] ;
  3112. MULPD XMM1, XMM7 ;
  3113. ADDPD XMM5, XMM1 ;
  3114. MULPD XMM0, XMM7 ;
  3115. ADDPD XMM6, XMM0 ;
  3116. ADD RBX, 16 ;
  3117. ADD RDI, 16 ;
  3118. SUB RAX, 2 ;
  3119. JMP aligned2 ;
  3120. horizontaladd: ;
  3121. MOVAPD XMM1, XMM2 ;
  3122. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3123. ADDPD XMM2, XMM1 ;
  3124. MOVAPD XMM1, XMM3 ;
  3125. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3126. ADDPD XMM3, XMM1 ;
  3127. MOVAPD XMM1, XMM4 ;
  3128. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3129. ADDPD XMM4, XMM1 ;
  3130. MOVAPD XMM1, XMM5 ;
  3131. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3132. ADDPD XMM5, XMM1 ;
  3133. MOVAPD XMM1, XMM6 ;
  3134. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3135. ADDPD XMM6, XMM1 ;
  3136. singlepieces: ;
  3137. CMP RAX, 0 ;
  3138. JLE store ; len <= 0- > exit
  3139. MOV RSI, [RBP+radr] ;
  3140. MOVSD XMM7, [RBX] ;
  3141. MOVSD XMM0, [RSI+RDI] ;
  3142. ADD RSI, RCX ;
  3143. MOVSD XMM1, [RSI+RDI] ;
  3144. MULSD XMM0, XMM7 ;
  3145. ADDSD XMM2, XMM0 ;
  3146. ADD RSI, RCX ;
  3147. MOVSD XMM0, [RSI+RDI] ;
  3148. MULSD XMM1, XMM7 ;
  3149. ADDSD XMM3, XMM1 ;
  3150. ADD RSI, RCX ;
  3151. MOVSD XMM1, [RSI+RDI] ;
  3152. MULSD XMM0, XMM7 ;
  3153. ADDSD XMM4, XMM0 ;
  3158. ADD RSI, RCX ;
  3159. MOVSD XMM0, [RSI+RDI] ;
  3160. MULSD XMM1, XMM7 ;
  3161. ADDSD XMM5, XMM1 ;
  3162. MULSD XMM0, XMM7 ;
  3163. ADDSD XMM6, XMM0 ;
3164. ADD RBX, 8 (* INC(ladr,incl) *)
3165. ADD RDI, 8 (* INC(radr,incr) *)
  3166. DEC RAX ; DEC(len)
  3167. JMP singlepieces ;
  3168. store:
  3169. MOVSD [RDX], XMM2 ;
  3170. ADD RDX, [RBP+incd] ;
  3171. MOVSD [RDX], XMM3 ;
  3172. ADD RDX, [RBP+incd] ;
  3173. MOVSD [RDX], XMM4 ;
  3174. ADD RDX, [RBP+incd] ;
  3175. MOVSD [RDX], XMM5 ;
  3176. ADD RDX, [RBP+incd] ;
  3177. MOVSD [RDX], XMM6 ;
  3178. end:
  3179. END AlignedSPXSSE5;
  3180. *)
  3181. (* sse version of scalar product *)
  3182. PROCEDURE AlignedSPXSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  3183. add: BOOLEAN );
  3184. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3185. ; register initialization
3186. MOV RAX, [RBP+len] ; RAX reserved FOR length
  3187. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  3188. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
  3189. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  3190. XORPD XMM0, XMM0 ;
  3191. CMP [RBP+add], 0 ; add?
  3192. JE aligned8 ; no add
  3193. MOVSD XMM0, [RDX] ;
  3194. aligned8:
  3195. CMP RAX, 8 ;
3196. JL aligned2 ; len < 8 -> EXIT TO aligned2
  3197. MOVAPD XMM1, [RBX] ;
  3198. MOVAPD XMM2, [RBX+16] ;
  3199. MOVAPD XMM3, [RBX+32] ;
  3200. MOVAPD XMM4, [RCX] ;
  3201. MOVAPD XMM5, [RCX+16] ;
  3202. MOVAPD XMM6, [RCX+32] ;
  3203. MULPD XMM1, XMM4 ;
  3204. ADDPD XMM0, XMM1 ;
  3205. MULPD XMM2, XMM5 ;
  3206. ADDPD XMM0, XMM2 ;
  3207. MULPD XMM3, XMM6 ;
  3208. ADDPD XMM0, XMM3 ;
  3209. MOVAPD XMM7, [RBX+48] ;
  3210. MOVAPD XMM1, [RCX+48] ;
  3211. MULPD XMM1, XMM7 ;
  3212. ADDPD XMM0, XMM1 ;
  3213. ADD RBX, 64 ;
  3214. ADD RCX, 64 ;
  3215. SUB RAX, 8 ;
  3216. JMP aligned8 ;
3217. ; LOOP FOR 4 pieces aligned
  3218. aligned4:
  3219. CMP RAX, 4 ;
3220. JL aligned2 ; len < 4 -> EXIT TO aligned2
  3221. MOVAPD XMM1, [RBX] ;
  3222. MOVAPD XMM2, [RCX] ;
  3223. MOVAPD XMM3, [RBX+16] ;
  3224. MOVAPD XMM4, [RCX+16] ;
  3225. MULPD XMM1, XMM2 ;
  3226. ADDPD XMM0, XMM1 ;
  3227. MULPD XMM3, XMM4 ;
  3228. ADDPD XMM0, XMM3 ;
  3229. ADD RBX, 32 ;
  3230. ADD RCX, 32 ;
  3231. SUB RAX, 4 ;
  3232. JMP aligned4 ;
  3233. aligned2:
  3234. CMP RAX, 2 ;
3235. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  3236. MOVAPD XMM1, [RBX] ;
  3237. MOVAPD XMM2, [RCX] ;
  3238. MULPD XMM1, XMM2 ;
  3239. ADDPD XMM0, XMM1 ;
  3240. ADD RBX, 16 ;
  3241. ADD RCX, 16 ;
  3242. SUB RAX, 2 ;
  3243. JMP aligned2 ;
  3244. horizontaladd: ;
  3245. MOVAPD XMM1, XMM0 ;
  3246. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3247. ADDPD XMM0, XMM1 ;
  3248. singlepieces: ;
  3249. CMP RAX, 0 ;
  3250. JLE store ; len <= 0- > EXIT
  3251. MOVSD XMM1, [RBX]
  3252. MOVSD XMM2, [RCX]
  3253. MULSD XMM1, XMM2
  3254. ADDSD XMM0, XMM1
  3255. ADD RBX, 8 ; INC(ladr, incl)
  3256. ADD RCX, 8 ; INC(radr, incr)
  3257. DEC RAX ; DEC(len)
  3258. JMP singlepieces ;
  3259. store:
  3260. MOVSD [RDX], XMM0 ;
  3261. endL:
  3262. END AlignedSPXSSE;
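(* Reference only: a scalar Oberon sketch of the dot product computed by AlignedSPXSSE above
(contiguous LONGREAL data, optional accumulation into dadr^); the procedure name is an illustrative
assumption. *)
(*
PROCEDURE AlignedSPXRef( ladr, radr, dadr: ADDRESS; len: SIZE; add: BOOLEAN );
VAR sum, x, y: LONGREAL;
BEGIN
	IF add THEN SYSTEM.GET( dadr, sum ) ELSE sum := 0 END;
	WHILE len > 0 DO
		SYSTEM.GET( ladr, x );  SYSTEM.GET( radr, y );
		sum := sum + x * y;
		INC( ladr, 8 );  INC( radr, 8 );  DEC( len );
	END;
	SYSTEM.PUT( dadr, sum );
END AlignedSPXRef;
*)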
  3263. (*
  3264. PROCEDURE AlignedSPRSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  3265. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3266. ; register initialization
  3267. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  3268. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
3269. MOV RSI, [RBP+radr] ; RSI reserved for radr
3270. MOV RAX, [RBP+len] ; RAX reserved for length
  3271. MOV RCX, [RBP+stride] ;
  3272. XORPS XMM2, XMM2 ;
  3273. XORPS XMM3, XMM3 ;
  3274. XORPS XMM4, XMM4 ;
  3275. XORPS XMM5, XMM5 ;
  3276. XORPS XMM6, XMM6 ;
  3277. XOR RDI, RDI ;
  3278. aligned8:
  3279. CMP RAX, 8 ;
  3280. JL aligned4 ; ; len < 4- > exit to singlepieces
  3281. PREFETCH0 24[RBX] ;
  3282. ; PREFETCH0[RSI] ;
  3283. MOV RSI, [RBP+radr] ;
  3284. ADD RSI, RDI ;
  3285. MOVAPS XMM7, [RBX] ;
  3286. MOVAPS XMM0, [RSI] ;
  3287. ADD RSI, RCX ;
  3288. MOVAPS XMM1, [RSI] ;
  3289. MULPS XMM0, XMM7 ;
  3290. ADDPS XMM2, XMM0 ;
  3291. ADD RSI, RCX ;
  3292. MOVAPS XMM0, [RSI] ;
  3293. MULPS XMM1, XMM7 ;
  3294. ADDPS XMM3, XMM1 ;
  3295. ADD RSI, RCX ;
  3296. MOVAPS XMM1, [RSI] ;
  3297. MULPS XMM0, XMM7 ;
  3298. ADDPS XMM4, XMM0 ;
  3299. ADD RSI, RCX ;
  3300. MOVAPS XMM0, [RSI] ;
  3301. MULPS XMM1, XMM7 ;
  3302. ADDPS XMM5, XMM1 ;
  3303. MULPS XMM0, XMM7 ;
  3304. ADDPS XMM6, XMM0 ;
  3305. ADD RBX, 16 ;
  3306. ADD RDI, 16 ;
  3307. MOV RSI, [RBP+radr] ;
  3308. ADD RSI, RDI ;
  3309. MOVAPS XMM7, [RBX] ;
  3310. MOVAPS XMM0, [RSI] ;
  3311. ADD RSI, RCX ;
  3312. MOVAPS XMM1, [RSI] ;
  3313. MULPS XMM0, XMM7 ;
  3314. ADDPS XMM2, XMM0 ;
  3315. ADD RSI, RCX ;
  3316. MOVAPS XMM0, [RSI] ;
  3317. MULPS XMM1, XMM7 ;
  3318. ADDPS XMM3, XMM1 ;
  3319. ADD RSI, RCX ;
  3320. MOVAPS XMM1, [RSI] ;
  3321. MULPS XMM0, XMM7 ;
  3322. ADDPS XMM4, XMM0 ;
  3323. ADD RSI, RCX ;
  3324. MOVAPS XMM0, [RSI] ;
  3325. MULPS XMM1, XMM7 ;
  3326. ADDPS XMM5, XMM1 ;
  3327. MULPS XMM0, XMM7 ;
  3328. ADDPS XMM6, XMM0 ;
  3329. ADD RBX, 16 ;
  3330. ADD RDI, 16 ;
  3331. SUB RAX, 8 ;
  3332. JMP aligned8 ;
  3333. aligned4:
  3334. CMP RAX, 4 ;
  3335. JL horizontaladd ; ; len < 4- > exit to singlepieces
  3336. MOV RSI, [RBP+radr] ;
  3337. ADD RSI, RDI ;
  3338. MOVAPS XMM7, [RBX] ;
  3339. MOVAPS XMM0, [RSI] ;
  3340. ADD RSI, RCX ;
  3341. MOVAPS XMM1, [RSI] ;
  3342. MULPS XMM0, XMM7 ;
  3343. ADDPS XMM2, XMM0 ;
  3344. ADD RSI, RCX ;
  3345. MOVAPS XMM0, [RSI] ;
  3346. MULPS XMM1, XMM7 ;
  3347. ADDPS XMM3, XMM1 ;
  3348. ADD RSI, RCX ;
  3349. MOVAPS XMM1, [RSI] ;
  3350. MULPS XMM0, XMM7 ;
  3351. ADDPS XMM4, XMM0 ;
  3352. ADD RSI, RCX ;
  3353. MOVAPS XMM0, [RSI] ;
  3354. MULPS XMM1, XMM7 ;
  3355. ADDPS XMM5, XMM1 ;
  3356. MULPS XMM0, XMM7 ;
  3357. ADDPS XMM6, XMM0 ;
  3358. ADD RBX, 16 ;
  3359. ADD RDI, 16 ;
  3360. SUB RAX, 4 ;
  3361. JMP aligned4 ;
  3362. horizontaladd: ;
  3363. MOVLHPS XMM1, XMM2 ;
  3364. ADDPS XMM1, XMM2 ;
  3365. SHUFPS XMM2, XMM1, 48 ;
  3366. ADDPS XMM2, XMM1 ;
  3367. MOVHLPS XMM2, XMM2 ;
  3368. MOVLHPS XMM1, XMM3 ;
  3369. ADDPS XMM1, XMM3 ;
  3370. SHUFPS XMM3, XMM1, 48 ;
  3371. ADDPS XMM3, XMM1 ;
  3372. MOVHLPS XMM3, XMM3 ;
  3373. MOVLHPS XMM1, XMM4 ;
  3374. ADDPS XMM1, XMM4 ;
  3375. SHUFPS XMM4, XMM1, 48 ;
  3376. ADDPS XMM4, XMM1 ;
  3377. MOVHLPS XMM4, XMM4 ;
  3378. MOVLHPS XMM1, XMM5 ;
  3379. ADDPS XMM1, XMM5 ;
  3380. SHUFPS XMM5, XMM1, 48 ;
  3381. ADDPS XMM5, XMM1 ;
  3382. MOVHLPS XMM5, XMM5 ;
  3383. MOVLHPS XMM1, XMM6 ;
  3384. ADDPS XMM1, XMM6 ;
  3385. SHUFPS XMM6, XMM1, 48 ;
  3386. ADDPS XMM6, XMM1 ;
  3387. MOVHLPS XMM6, XMM6 ;
  3388. singlepieces: ;
  3389. CMP RAX, 0 ;
  3390. JLE store ; len <= 0- > exit
  3391. MOV RSI, [RBP+radr] ;
  3392. MOVSS XMM7, [RBX] ;
  3393. MOVSS XMM0, [RSI+RDI] ;
  3394. ADD RSI, RCX ;
  3395. MOVSS XMM1, [RSI+RDI] ;
  3396. MULSS XMM0, XMM7 ;
  3397. ADDSS XMM2, XMM0 ;
  3398. ADD RSI, RCX ;
  3399. MOVSS XMM0, [RSI+RDI] ;
  3400. MULSS XMM1, XMM7 ;
  3401. ADDSS XMM3, XMM1 ;
  3402. ADD RSI, RCX ;
  3403. MOVSS XMM1, [RSI+RDI] ;
  3404. MULSS XMM0, XMM7 ;
  3405. ADDSS XMM4, XMM0 ;
  3406. ADD RSI, RCX ;
  3407. MOVSS XMM0, [RSI+RDI] ;
  3408. MULSS XMM1, XMM7 ;
  3409. ADDSS XMM5, XMM1 ;
  3410. MULSS XMM0, XMM7 ;
  3411. ADDSS XMM6, XMM0 ;
  3412. ADD RBX, 4 (* INC(ladr,incl) *)
  3413. ADD RDI, 4 (* INC(radr,incr) *)
  3414. DEC RAX ; DEC(len)
  3415. JMP singlepieces ;
  3416. store:
  3417. MOVSS [RDX], XMM2 ;
  3418. ADD RDX, [RBP+incd] ;
  3419. MOVSS [RDX], XMM3 ;
  3420. ADD RDX, [RBP+incd] ;
  3421. MOVSS [RDX], XMM4 ;
  3422. ADD RDX, [RBP+incd] ;
  3423. MOVSS [RDX], XMM5 ;
  3424. ADD RDX, [RBP+incd] ;
  3425. MOVSS [RDX], XMM6 ;
  3426. end:
  3427. END AlignedSPRSSE5;
  3428. *)
  3429. PROCEDURE AlignedSPRSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  3430. add: BOOLEAN );
  3431. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3432. ; register initialization
  3433. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  3434. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  3435. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
3436. MOV RAX, [RBP+len] ; RAX reserved FOR length
  3437. XORPS XMM0, XMM0 ;
  3438. CMP [RBP+add], 0 ; add?
  3439. JE aligned16 ; no add
  3440. MOVSS XMM0, [RDX] ;
  3441. aligned16:
  3442. CMP RAX, 16 ;
3443. JL aligned8 ; len < 16 -> EXIT TO aligned8
  3444. MOVAPS XMM1, [RBX] ;
  3445. MOVAPS XMM4, [RCX] ;
  3446. MOVAPS XMM2, [RBX+16] ;
  3447. MOVAPS XMM5, [RCX+16] ;
  3448. MULPS XMM1, XMM4 ;
  3449. ADDPS XMM0, XMM1 ;
  3450. MOVAPS XMM3, [RBX+32] ;
  3451. MOVAPS XMM6, [RCX+32] ;
  3452. MULPS XMM2, XMM5 ;
  3453. ADDPS XMM0, XMM2 ;
  3454. MOVAPS XMM7, [RBX+48] ;
  3455. MOVAPS XMM1, [RCX+48] ;
  3456. MULPS XMM3, XMM6 ;
  3457. ADDPS XMM0, XMM3 ;
  3458. MULPS XMM1, XMM7 ;
  3459. ADDPS XMM0, XMM1 ;
  3460. ADD RBX, 64 ;
  3461. ADD RCX, 64 ;
  3462. SUB RAX, 16 ;
  3463. JMP aligned16 ;
  3464. ; LOOP FOR 8 pieces aligned
  3465. aligned8:
  3466. CMP RAX, 8 ;
3467. JL aligned4 ; len < 8 -> EXIT TO aligned4
  3468. MOVAPS XMM1, [RBX] ;
  3469. MOVAPS XMM4, [RCX] ;
  3470. MOVAPS XMM2, [RBX+16] ;
  3471. MOVAPS XMM5, [RCX+16] ;
  3472. MULPS XMM1, XMM4 ;
  3473. ADDPS XMM0, XMM1 ;
  3474. MULPS XMM2, XMM5 ;
  3475. ADDPS XMM0, XMM2 ;
  3476. ADD RBX, 32 ;
  3477. ADD RCX, 32 ;
  3478. SUB RAX, 8 ;
  3479. JMP aligned8 ;
  3480. aligned4:
  3481. CMP RAX, 4 ;
3482. JL horizontaladd ; len < 4 -> EXIT TO horizontaladd
  3483. MOVAPS XMM1, [RBX] ;
  3484. MOVAPS XMM2, [RCX] ;
  3485. MULPS XMM1, XMM2 ;
  3486. ADDPS XMM0, XMM1 ;
  3487. ADD RBX, 16 ;
  3488. ADD RCX, 16 ;
  3489. SUB RAX, 4 ;
  3490. JMP aligned4 ;
  3491. horizontaladd: ;
  3492. MOVAPS XMM1, XMM0 ;
  3493. ; 1*0 (* dest 0 -> dest 0 *) + 4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  3494. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  3495. ADDPS XMM1, XMM0 ;
  3496. MOVAPS XMM0, XMM1
  3497. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  3498. ADDPS XMM0, XMM1 ;
  3499. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
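; at this point lane 0 OF XMM0 holds the sum OF all four lanes OF the accumulator
; (the remaining lanes contain partial sums and are ignored by the final MOVSS store)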
  3500. singlepieces: ;
  3501. CMP RAX, 0 ;
  3502. JLE store ; len <= 0- > EXIT
  3503. MOVSS XMM1, [RBX]
  3504. MOVSS XMM2, [RCX]
  3505. MULSS XMM1, XMM2
  3506. ADDSS XMM0, XMM1
  3507. ADD RBX, 4 ; INC(ladr, incl)
  3508. ADD RCX, 4 ; INC(radr, incr)
  3509. DEC RAX ; DEC(len)
  3510. JMP singlepieces ;
  3511. store:
  3512. MOVSS [RDX], XMM0 ;
  3513. endL:
  3514. END AlignedSPRSSE;
  3515. (*
  3516. (* sse version of scalar product *)
  3517. PROCEDURE AlignedSPRSSE( ladr, radr, dadr, rows, stride, dinc, len: LONGINT );
  3518. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3519. ; register initialization
  3520. MOV RDI, [RBP+radr] ; radr start
  3521. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  3522. MOV RSI, [RBP+rows] ; outer loop counter
  3523. outerloop:
  3524. CMP RSI, 0 ;
  3525. JLE end ;
  3526. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
  3527. MOV RCX, RDI ; RCX reserved for radr
3528. MOV RAX, [RBP+len] ; RAX reserved for length
  3529. XORPS XMM0, XMM0 ;
  3530. aligned16:
  3531. CMP RAX, 16 ;
  3532. JL aligned8 ; len < 4- > exit to singlepieces
  3533. MOVAPS XMM1, [RBX] ;
  3534. MOVAPS XMM2, [RBX+16] ;
  3535. MOVAPS XMM3, [RBX+32] ;
  3536. MOVAPS XMM4, [RCX] ;
  3537. MOVAPS XMM5, [RCX+16] ;
  3538. MOVAPS XMM6, [RCX+32] ;
  3539. MULPS XMM1, XMM4 ;
  3540. ADDPS XMM0, XMM1 ;
  3541. MULPS XMM2, XMM5 ;
  3542. ADDPS XMM0, XMM2 ;
  3543. MULPS XMM3, XMM6 ;
  3544. ADDPS XMM0, XMM3 ;
  3545. MOVAPS XMM7, [RBX+48] ;
  3546. MOVAPS XMM1, [RCX+48] ;
  3547. MULPS XMM1, XMM7 ;
  3548. ADDPS XMM0, XMM1 ;
  3549. ADD RBX, 64 ;
  3550. ADD RCX, 64 ;
  3551. SUB RAX, 16 ;
  3552. JMP aligned16 ;
  3553. ; loop for 8 pieces aligned
  3554. aligned8:
  3555. CMP RAX, 8 ;
  3556. JL aligned4 ; ; len < 4- > exit to singlepieces
  3557. MOVAPS XMM1, [RBX] ;
  3558. MOVAPS XMM2, [RBX+16] ;
  3559. MOVAPS XMM4, [RCX] ;
  3560. MOVAPS XMM5, [RCX+16] ;
  3561. MULPS XMM1, XMM4 ;
  3562. ADDPS XMM0, XMM1 ;
  3563. MULPS XMM2, XMM5 ;
  3564. ADDPS XMM0, XMM2 ;
  3565. ADD RBX, 32 ;
  3566. ADD RCX, 32 ;
  3567. SUB RAX, 8 ;
  3568. JMP aligned8 ;
  3569. aligned4:
  3570. CMP RAX, 4 ;
  3571. JL horizontaladd ; ; len < 4- > exit to singlepieces
  3572. MOVAPS XMM1, [RBX] ;
  3573. MOVAPS XMM2, [RCX] ;
  3574. MULPS XMM1, XMM2 ;
  3575. ADDPS XMM0, XMM1 ;
  3576. ADD RBX, 16 ;
  3577. ADD RCX, 16 ;
  3578. SUB RAX, 4 ;
  3579. JMP aligned4 ;
  3580. horizontaladd: ;
  3581. MOVAPS XMM1, XMM0 ;
  3582. SHUFPS XMM1, XMM1, 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  3583. ADDPS XMM1, XMM0 ;
  3584. MOVAPS XMM0, XMM1
  3585. SHUFPS XMM0, XMM0, 16*3 ; (* src 3-> dest 2 *)
  3586. ADDPS XMM0, XMM1 ;
  3587. SHUFPS XMM0, XMM0, 1*2 ; (* dest 2 -> dest 0 *)
  3588. singlepieces: ;
  3589. CMP RAX, 0 ;
  3590. JLE store ; len <= 0- > exit
  3591. MOVSS XMM1, [RBX]
  3592. MOVSS XMM2, [RCX]
  3593. MULSS XMM1, XMM2
  3594. ADDSS XMM0, XMM1
  3595. ADD RBX, 4 (* INC(ladr,incl) *)
  3596. ADD RCX, 4 (* INC(radr,incr) *)
  3597. DEC RAX ; DEC(len)
  3598. JMP singlepieces ;
  3599. store:
  3600. MOVSS [RDX], XMM0 ;
  3601. ADD RDX, [RBP+dinc] ;
  3602. ADD RDI, [RBP+stride] ;
  3603. DEC RSI ;
  3604. JMP outerloop ;
  3605. end:
  3606. END AlignedSPRSSE;
  3607. *)
  3608. PROCEDURE Copy4( ladr, dadr: ADDRESS; linc, dinc, len: SIZE);
  3609. CODE {SYSTEM.AMD64}
3610. MOV RSI, [RBP+ladr] ; RSI := ladr
3611. MOV RDI, [RBP+dadr] ; RDI := dadr
3612. MOV RCX, [RBP+len] ; RCX := len
  3613. MOV RAX, [RBP+linc] ;
  3614. CMP RAX, 4 ;
  3615. JNE loopL ;
  3616. MOV RAX, [RBP+dinc] ;
  3617. CMP RAX, 4 ;
  3618. JNE loopL ;
  3619. fastmove:
  3620. CLD ; incremental
  3621. REP ;
3622. MOVSD ; copy len doublewords (4 bytes each)
  3623. JMP endL ;
  3624. loopL:
  3625. CMP RCX, 0 ;
  3626. JLE endL ; WHILE RCX > 0 DO
3627. MOV EAX, [RSI] ; EAX := SYSTEM.GET32(RSI)
3628. MOV [RDI], EAX ; SYSTEM.PUT32(RDI, EAX)
3629. ADD RSI, [RBP+linc] ; INC(RSI, linc)
3630. ADD RDI, [RBP+dinc] ; INC(RDI, dinc)
  3631. DEC RCX ; DEC(RCX)
  3632. JMP loopL
  3633. endL:
  3634. END Copy4;
  3635. PROCEDURE Copy8( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
  3636. CODE {SYSTEM.AMD64}
3637. MOV RSI, [RBP+ladr] ; RSI := ladr
3638. MOV RDI, [RBP+dadr] ; RDI := dadr
3639. MOV RCX, [RBP+len] ; RCX := len
  3640. MOV RAX, [RBP+linc] ;
  3641. CMP RAX, 8 ;
  3642. JNE loopL ;
  3643. MOV RAX, [RBP+dinc] ;
  3644. CMP RAX, 8 ;
  3645. JNE loopL ;
  3646. fastmove:
3647. SHL RCX, 1 ; len 8-byte elements = 2*len doublewords
3648. CLD ; incremental
3649. REP ;
3650. MOVSD ; copy 2*len doublewords (4 bytes each)
  3651. JMP endL ;
  3652. loopL:
  3653. CMP RCX, 0 ;
3654. JLE endL ; WHILE RCX > 0 DO
3655. MOV RAX, [RSI] ; RAX := SYSTEM.GET64(RSI)
3656. MOV [RDI], RAX ; SYSTEM.PUT64(RDI, RAX)
3657. ADD RSI, [RBP+linc] ; INC(RSI, linc)
3658. ADD RDI, [RBP+dinc] ; INC(RDI, dinc)
3659. DEC RCX ; DEC(RCX)
  3660. JMP loopL
  3661. endL:
  3662. END Copy8;
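(* Reference only: Copy4 and Copy8 above are strided element copies; in plain Oberon the 4-byte case
corresponds to the sketch below (the 8-byte case differs only in the element size). The fast path with
REP MOVSD applies when both increments equal the element size, i.e. the data is contiguous. The
procedure name is an illustrative assumption. *)
(*
PROCEDURE Copy4Ref( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
BEGIN
	WHILE len > 0 DO
		SYSTEM.PUT32( dadr, SYSTEM.GET32( ladr ) );
		INC( ladr, linc );  INC( dadr, dinc );  DEC( len );
	END;
END Copy4Ref;
*)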
  3663. PROCEDURE Transpose4A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3664. CODE {SYSTEM.AMD64}
  3665. startrows:
  3666. MOV RAX, [RBP+rows] ;
  3667. startouter:
  3668. CMP RAX, 0 ;
  3669. JLE endL ;
  3670. MOV RSI, [RBP+ladr] ;
  3671. MOV RDI, [RBP+dadr] ;
  3672. MOV RBX, [RBP+linc] ;
  3673. MOV RCX, [RBP+dstride] ;
  3674. MOV RAX, [RBP+cols] ;
  3675. startinner:
  3676. CMP RAX, 0 ;
  3677. JLE endinner ;
  3678. MOV RDX, [RSI] ;
  3679. MOV [RDI], RDX ;
  3680. ADD RSI, RBX ;
  3681. ADD RDI, RCX ;
  3682. DEC RAX ;
  3683. JMP startinner ;
  3684. endinner:
  3685. MOV RSI, [RBP+ladr] ;
  3686. ADD RSI, [RBP+lstride] ;
  3687. MOV [RBP+ladr], RSI
  3688. MOV RDI, [RBP+dadr] ;
  3689. ADD RDI, [RBP+dinc] ;
  3690. MOV [RBP+dadr], RDI ;
  3691. MOV RAX, [RBP+rows] ;
  3692. DEC RAX ;
  3693. MOV [RBP+rows], RAX ;
  3694. JMP startouter ;
  3695. endL:
  3696. END Transpose4A;
  3697. PROCEDURE Transpose4( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3698. VAR l, d, c: SIZE; BlockSize: SIZE;
  3699. BEGIN
3700. BlockSize :=
3701. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3702. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3703. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3704. BlockSize := MAX( 8, BlockSize );
  3705. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3706. WHILE (rows >= BlockSize) DO
  3707. c := cols; l := ladr; d := dadr;
  3708. WHILE (c >= BlockSize) DO
  3709. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3710. BlockSize );
  3711. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3712. INC( d, BlockSize * dstride );
  3713. END;
  3714. IF c > 0 THEN
  3715. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3716. END;
  3717. DEC( rows, BlockSize ); INC( ladr, BlockSize * lstride );
  3718. INC( dadr, BlockSize * dinc );
  3719. END;
  3720. IF (rows > 0) THEN
  3721. c := cols; l := ladr; d := dadr;
  3722. WHILE (c >= BlockSize) DO
  3723. Transpose4A( l, d, lstride, linc, dstride, dinc, rows,
  3724. BlockSize );
  3725. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3726. INC( d, BlockSize * dstride );
  3727. END;
  3728. IF c > 0 THEN
  3729. Transpose4A( l, d, lstride, linc, dstride, dinc, rows, c );
  3730. END;
  3731. END;
  3732. END Transpose4;
  3733. PROCEDURE Transpose8( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3734. VAR l, d, c: SIZE; BlockSize: SIZE;
  3735. BEGIN
3736. BlockSize :=
3737. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3738. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3739. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3740. BlockSize := MAX( 8, BlockSize );
  3741. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3742. WHILE (rows >= BlockSize) DO
  3743. c := cols; l := ladr; d := dadr;
  3744. WHILE (c >= BlockSize) DO
  3745. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3746. BlockSize );
  3747. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3748. INC( d, BlockSize * dstride );
  3749. END;
  3750. IF c > 0 THEN
  3751. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3752. END;
  3753. DEC( rows, BlockSize ); INC( ladr, lstride * BlockSize );
  3754. INC( dadr, dinc * BlockSize );
  3755. END;
  3756. IF (rows > 0) THEN
  3757. c := cols; l := ladr; d := dadr;
  3758. WHILE (c >= BlockSize) DO
  3759. Transpose8A( l, d, lstride, linc, dstride, dinc, rows,
  3760. BlockSize );
  3761. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3762. INC( d, BlockSize * dstride );
  3763. END;
  3764. IF c > 0 THEN
  3765. Transpose8A( l, d, lstride, linc, dstride, dinc, rows, c );
  3766. END;
  3767. END;
  3768. END Transpose8;
  3769. PROCEDURE Transpose8A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3770. CODE {SYSTEM.AMD64}
  3771. startrows:
  3772. MOV RAX, [RBP+rows] ;
  3773. startouter:
  3774. CMP RAX, 0 ;
  3775. JLE endL ;
  3776. MOV RSI, [RBP+ladr] ;
  3777. MOV RDI, [RBP+dadr] ;
  3778. MOV RBX, [RBP+linc] ;
  3779. MOV RCX, [RBP+dstride] ;
  3780. MOV RAX, [RBP+cols] ;
  3781. startinner:
  3782. CMP RAX, 0 ;
  3783. JLE endinner ;
  3784. MOV RDX, [RSI] ;
  3785. MOV [RDI], RDX ;
  3786. MOV RDX, [RSI+4] ;
  3787. MOV [RDI+4], RDX ;
  3788. ADD RSI, RBX ;
  3789. ADD RDI, RCX ;
  3790. DEC RAX ;
  3791. JMP startinner ;
  3792. endinner:
  3793. MOV RSI, [RBP+ladr] ;
  3794. ADD RSI, [RBP+lstride] ;
  3795. MOV [RBP+ladr], RSI
  3796. MOV RDI, [RBP+dadr] ;
  3797. ADD RDI, [RBP+dinc] ;
  3798. MOV [RBP+dadr], RDI ;
  3799. MOV RAX, [RBP+rows] ;
  3800. DEC RAX ;
  3801. MOV [RBP+rows], RAX ;
  3802. JMP startouter ;
  3803. endL:
  3804. END Transpose8A;
  3805. PROCEDURE SSEMul24BlockR( VAR CbFirst: SIZE;
  3806. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3807. add: BOOLEAN );
  3808. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3809. MatrixOfResultsSetup:
  3810. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3811. RowOfResultsLoop:
  3812. MOV RBX, 0 ; counter FOR columns IN B-Cb
  3813. DotProductSetup:
  3814. MOV RSI, [RBP+matrixA] ; matrixA
  3815. MOV RDI, [RBP+matrixB] ; matrixB
  3816. LEA RDI, [RDI+RBX*4] ; current position IN matrixB
  3817. XORPS XMM2, XMM2
  3818. XORPS XMM3, XMM3
  3819. XORPS XMM4, XMM4
  3820. XORPS XMM5, XMM5
  3821. XORPS XMM6, XMM6
  3822. XORPS XMM7, XMM7
  3823. MOV RAX, 0 ;
  3824. MOV AL, [RBP+add] ;
  3825. CMP AL, 0 ; add?
  3826. JE DotProductLoop ;
  3827. MOV RAX, [RBP+matrixC] ; matrixC
  3828. LEA RAX, [RAX+RBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3829. MOVUPS XMM2, [RAX]
  3830. MOVUPS XMM3, [RAX+16]
  3831. MOVUPS XMM4, [RAX+32]
  3832. MOVUPS XMM5, [RAX+48]
  3833. MOVUPS XMM6, [RAX+64]
  3834. MOVUPS XMM7, [RAX+80]
  3835. MOV RAX, 0
  3836. DotProductLoop:
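; sparse-entry shortcut: the next four instructions load 8 bytes OF matrixA starting at the current
; element, discard the topmost (sign) bit with the shift and skip the multiply-accumulate block
; below IF the remaining bits are all zero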
  3837. MOV RDX, [RSI+RAX*4]
  3838. SHL RDX, 1
  3839. CMP RDX, 0
  3840. JE SparseEntryEscape
  3841. MOVSS XMM0, [RSI+RAX*4]
  3842. SHUFPS XMM0, XMM0, 0H
  3843. MOVUPS XMM1, [RDI]
  3844. MULPS XMM1, XMM0
  3845. ADDPS XMM2, XMM1
  3846. MOVUPS XMM1, [RDI+16]
  3847. MULPS XMM1, XMM0
  3848. ADDPS XMM3, XMM1
  3849. MOVUPS XMM1, [RDI+32]
  3850. MULPS XMM1, XMM0
  3851. ADDPS XMM4, XMM1
  3852. MOVUPS XMM1, [RDI+48]
  3853. MULPS XMM1, XMM0
  3854. ADDPS XMM5, XMM1
  3855. MOVUPS XMM1, [RDI+64]
  3856. MULPS XMM1, XMM0
  3857. ADDPS XMM6, XMM1
  3858. MOVUPS XMM1, [RDI+80]
  3859. MULPS XMM1, XMM0
  3860. ADDPS XMM7, XMM1
  3861. SparseEntryEscape:
  3862. ADD RDI, [RBP+StrideB] ; StrideB
  3863. INC RAX
  3864. CMP RAX, [RBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3865. JL DotProductLoop
3866. ; end DotProductLoop
  3867. MOV RAX, [RBP+matrixC] ; matrixC
  3868. LEA RAX, [RAX+RBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3869. MOVUPS [RAX], XMM2
  3870. MOVUPS [RAX+16], XMM3
  3871. MOVUPS [RAX+32], XMM4
  3872. MOVUPS [RAX+48], XMM5
  3873. MOVUPS [RAX+64], XMM6
  3874. MOVUPS [RAX+80], XMM7
  3875. ADD RBX, 24 ; move over TO next batch OF 24
  3876. MOV RDX, RBX
  3877. ADD RDX, 24
  3878. CMP RDX, [RBP+Cb] ; Cb, check TO see IF row IS complete
  3879. JLE DotProductSetup
3880. ; end RowOfResultsLoop
  3881. MOV RAX, [RBP+matrixA] ; matrixA
  3882. ADD RAX, [RBP+StrideA] ; StrideA
  3883. MOV [RBP+matrixA], RAX ; matrixA
  3884. MOV RAX, [RBP+matrixC] ; matrixC
  3885. ADD RAX, [RBP+StrideC] ; StrideC
  3886. MOV [RBP+matrixC], RAX ; matrixC
  3887. INC RCX
  3888. CMP RCX, [RBP+Ra] ; Ra
  3889. JL RowOfResultsLoop
  3890. Done:
  3891. MOV RAX, [RBP+CbFirst] ; CbFirst
  3892. MOV [RAX], RBX ;
  3893. END SSEMul24BlockR;
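(* Reference only: a plain Oberon sketch of the inner update performed by SSEMul24BlockR above for one
row of A and one batch of 24 columns of C, with the add flag set and the sparse-entry shortcut omitted;
identifier names are illustrative assumptions. *)
(*
PROCEDURE Mul24RowRef( adrA, adrB, adrC: ADDRESS; strideB, Ca: SIZE );
VAR k, j: SIZE; a, b, c: REAL;
BEGIN
	FOR k := 0 TO Ca - 1 DO  (* broadcast one element of the A row over 24 columns of C *)
		SYSTEM.GET( adrA + k * 4, a );
		FOR j := 0 TO 23 DO
			SYSTEM.GET( adrB + j * 4, b );  SYSTEM.GET( adrC + j * 4, c );
			SYSTEM.PUT( adrC + j * 4, c + a * b );
		END;
		INC( adrB, strideB );  (* advance to the next row of B *)
	END;
END Mul24RowRef;
*)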
3894. (*! might be better to make a 10Block operation and utilize 2 registers for temporary calculations, see article about Emmerald *)
  3895. PROCEDURE SSEMul12BlockX( VAR CbFirst: SIZE;
  3896. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC :ADDRESS;
  3897. add: BOOLEAN );
  3898. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3899. MatrixOfResultsSetup:
  3900. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3901. RowOfResultsLoop:
  3902. MOV RBX, 0 ; counter FOR columns IN B-Cb
  3903. DotProductSetup:
  3904. MOV RSI, [RBP+matrixA] ; matrixA
  3905. MOV RDI, [RBP+matrixB] ; matrixB
  3906. LEA RDI, [RDI+RBX*8]
  3907. XORPD XMM2, XMM2
  3908. XORPD XMM3, XMM3
  3909. XORPD XMM4, XMM4
  3910. XORPD XMM5, XMM5
  3911. XORPD XMM6, XMM6
  3912. XORPD XMM7, XMM7
  3913. MOV RAX, 0 ;
  3914. MOV AL, [RBP+add] ;
  3915. CMP AL, 0 ; add?
  3916. JE DotProductLoop ;
  3917. MOV RAX, [RBP+matrixC] ; matrixC
  3918. LEA RAX, [RAX+RBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3919. MOVUPD XMM2, [RAX]
  3920. MOVUPD XMM3, [RAX+16]
  3921. MOVUPD XMM4, [RAX+32]
  3922. MOVUPD XMM5, [RAX+48]
  3923. MOVUPD XMM6, [RAX+64]
  3924. MOVUPD XMM7, [RAX+80]
  3925. MOV RAX, 0
  3926. DotProductLoop:
  3927. ; MOV RDX, [RSI+RAX*8]
  3928. ; SHL RDX, 1
  3929. ; CMP RDX, 0
  3930. ; JE SparseEntryEscape
  3931. MOVSD XMM0, [RSI+RAX*8]
  3932. SHUFPD XMM0, XMM0, 0H
  3933. MOVUPD XMM1, [RDI]
  3934. MULPD XMM1, XMM0
  3935. ADDPD XMM2, XMM1
  3936. MOVUPD XMM1, [RDI+16]
  3937. MULPD XMM1, XMM0
  3938. ADDPD XMM3, XMM1
  3939. MOVUPD XMM1, [RDI+32]
  3940. MULPD XMM1, XMM0
  3941. ADDPD XMM4, XMM1
  3942. MOVUPD XMM1, [RDI+48]
  3943. MULPD XMM1, XMM0
  3944. ADDPD XMM5, XMM1
  3945. MOVUPD XMM1, [RDI+64]
  3946. MULPD XMM1, XMM0
  3947. ADDPD XMM6, XMM1
  3948. MOVUPD XMM1, [RDI+80]
  3949. MULPD XMM1, XMM0
  3950. ADDPD XMM7, XMM1
  3951. SparseEntryEscape:
  3952. ADD RDI, [RBP+StrideB] ; StrideB
  3953. INC RAX
  3954. CMP RAX, [RBP+Ca] ; Ca, could also compare TO Rb since they must be equal
3955. JL DotProductLoop ; end DotProductLoop
  3956. MOV RAX , [RBP+matrixC] ; matrixC
3957. LEA RAX, [RAX+RBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3958. MOVUPD [RAX], XMM2
  3959. MOVUPD [RAX+16], XMM3
  3960. MOVUPD [RAX+32], XMM4
  3961. MOVUPD [RAX+48], XMM5
  3962. MOVUPD [RAX+64], XMM6
  3963. MOVUPD [RAX+80], XMM7
  3964. ADD RBX, 12 ; move over TO next batch OF 12
  3965. MOV RDX, RBX
  3966. ADD RDX, 12
  3967. CMP RDX, [RBP+Cb] ; Cb, check TO see IF row IS complete
  3968. JLE DotProductSetup ; end RowOfResultsLoop
  3969. MOV RAX , [RBP+matrixA] ; matrixA
  3970. ADD RAX, [RBP+StrideA] ; StrideA
  3971. MOV [RBP+matrixA], RAX ; matrixA
  3972. MOV RAX, [RBP+matrixC] ; matrixC
  3973. ADD RAX, [RBP+StrideC] ; StrideC
  3974. MOV [RBP+matrixC], RAX ; matrixC
  3975. INC RCX
  3976. CMP RCX, [RBP+Ra] ; Ra
  3977. JL RowOfResultsLoop
  3978. Done:
  3979. MOV RAX, [RBP+CbFirst] ; CbFirst
  3980. MOV [RAX], RBX ;
  3981. END SSEMul12BlockX;
  3982. PROCEDURE SSEMul16BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3983. add: BOOLEAN );
  3984. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3985. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3986. DotProductSetup:
  3987. MOV RSI, [RBP+matrixA] ; matrixA
  3988. MOV RDI, [RBP+matrixB] ; matrixB
  3989. MOV RDX, [RBP+CbFrom] ; CbFrom
  3990. LEA RDI, [RDI+RDX*4]
  3991. XORPS XMM2, XMM2
  3992. XORPS XMM3, XMM3
  3993. XORPS XMM4, XMM4
  3994. XORPS XMM5, XMM5
  3995. MOV RAX, 0 ;
  3996. MOV AL, [RBP+add] ;
  3997. CMP AL, 0 ; add?
  3998. JE DotProductLoop ;
  3999. MOV RAX, [RBP+matrixC] ; matrixC
  4000. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally
  4001. MOVUPS XMM2, [RAX]
  4002. MOVUPS XMM3, [RAX+16]
  4003. MOVUPS XMM4, [RAX+32]
  4004. MOVUPS XMM5, [RAX+48]
  4005. MOV RAX, 0
  4006. DotProductLoop:
  4007. MOV RDX, [RSI+RAX*4]
  4008. SHL RDX, 1
  4009. CMP RDX, 0
  4010. JE SparseEntryEscape
  4011. MOVSS XMM0, [RSI+RAX*4]
  4012. SHUFPS XMM0, XMM0, 0H
  4013. MOVUPS XMM1, [RDI]
  4014. MULPS XMM1, XMM0
  4015. ADDPS XMM2, XMM1
  4016. MOVUPS XMM1, [RDI+16]
  4017. MULPS XMM1, XMM0
  4018. ADDPS XMM3, XMM1
  4019. MOVUPS XMM1, [RDI+32]
  4020. MULPS XMM1, XMM0
  4021. ADDPS XMM4, XMM1
  4022. MOVUPS XMM1, [RDI+48]
  4023. MULPS XMM1, XMM0
  4024. ADDPS XMM5, XMM1
  4025. SparseEntryEscape:
  4026. ADD RDI, [RBP+StrideB] ; StrideB
  4027. INC RAX
  4028. CMP RAX, [RBP+Ca] ; Ca
  4029. JL DotProductLoop ; end DotProductLoop
  4030. MOV RAX , [RBP+matrixC] ; matrixC
4031. MOV RDX, [RBP+CbFrom] ; CbFrom
4032. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 16
  4033. MOVUPS [RAX], XMM2
  4034. MOVUPS [RAX+16], XMM3
  4035. MOVUPS [RAX+32], XMM4
  4036. MOVUPS [RAX+48], XMM5
  4037. MOV RAX, [RBP+matrixA] ; matrixA
  4038. ADD RAX, [RBP+StrideA] ; StrideA
  4039. MOV [RBP+matrixA], RAX ; matrixA
  4040. MOV RAX, [RBP+matrixC] ; matrixC
  4041. ADD RAX, [RBP+StrideC] ; StrideC
  4042. MOV [RBP+matrixC], RAX ; matrixC
  4043. INC RCX
  4044. CMP RCX, [RBP+Ra] ; Ra
  4045. JL DotProductSetup ;
  4046. END SSEMul16BlockR;
  4047. PROCEDURE SSEMul8BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4048. add: BOOLEAN );
  4049. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  4050. MOV RCX, 0 ; counter FOR rows IN A-Ra
  4051. DotProductSetup:
  4052. MOV RSI, [RBP+matrixA] ; matrixA
  4053. MOV RDI, [RBP+matrixB] ; matrixB
  4054. MOV RDX, [RBP+CbFrom] ; CbFrom
  4055. LEA RDI, [RDI+RDX*8]
  4056. XORPD XMM2, XMM2
  4057. XORPD XMM3, XMM3
  4058. XORPD XMM4, XMM4
  4059. XORPD XMM5, XMM5
  4060. MOV RAX, 0 ;
  4061. MOV AL, [RBP+add] ;
  4062. CMP AL, 0 ; add?
  4063. JE DotProductLoop ;
  4064. MOV RAX, [RBP+matrixC] ; matrixC
4065. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  4066. MOVUPD XMM2, [RAX]
  4067. MOVUPD XMM3, [RAX+16]
  4068. MOVUPD XMM4, [RAX+32]
  4069. MOVUPD XMM5, [RAX+48]
  4070. MOV RAX, 0
  4071. DotProductLoop:
  4072. ; MOV RDX, [RSI+RAX*8]
  4073. ; SHL RDX, 1
  4074. ; CMP RDX, 0
  4075. ; JE SparseEntryEscape
  4076. MOVSD XMM0, [RSI+RAX*8]
  4077. SHUFPD XMM0, XMM0, 0H
  4078. MOVUPD XMM1, [RDI]
  4079. MULPD XMM1, XMM0
  4080. ADDPD XMM2, XMM1
  4081. MOVUPD XMM1, [RDI+16]
  4082. MULPD XMM1, XMM0
  4083. ADDPD XMM3, XMM1
  4084. MOVUPD XMM1, [RDI+32]
  4085. MULPD XMM1, XMM0
  4086. ADDPD XMM4, XMM1
  4087. MOVUPD XMM1, [RDI+48]
  4088. MULPD XMM1, XMM0
  4089. ADDPD XMM5, XMM1
  4090. SparseEntryEscape:
  4091. ADD RDI, [RBP+StrideB] ; StrideB
  4092. INC RAX
  4093. CMP RAX, [RBP+Ca] ; Ca
  4094. JL DotProductLoop ; end DotProductLoop
  4095. MOV RAX , [RBP+matrixC] ; matrixC
4096. MOV RDX, [RBP+CbFrom] ; CbFrom
4097. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  4098. MOVUPD [RAX], XMM2
  4099. MOVUPD [RAX+16], XMM3
  4100. MOVUPD [RAX+32], XMM4
  4101. MOVUPD [RAX+48], XMM5
  4102. MOV RAX, [RBP+matrixA] ; matrixA
  4103. ADD RAX, [RBP+StrideA] ; StrideA
  4104. MOV [RBP+matrixA], RAX ; matrixA
  4105. MOV RAX, [RBP+matrixC] ; matrixC
  4106. ADD RAX, [RBP+StrideC] ; StrideC
  4107. MOV [RBP+matrixC], RAX ; matrixC
  4108. INC RCX
  4109. CMP RCX, [RBP+Ra] ; Ra
  4110. JL DotProductSetup ;
  4111. END SSEMul8BlockX;
  4112. PROCEDURE SSEMul8BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4113. add: BOOLEAN );
  4114. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  4115. MOV RCX, 0 ; counter FOR rows IN A-Ra
  4116. DotProductSetup:
  4117. MOV RSI, [RBP+matrixA] ; matrixA
  4118. MOV RDI, [RBP+matrixB] ; matrixB
  4119. MOV RDX, [RBP+CbFrom] ; CbFrom
  4120. LEA RDI, [RDI+RDX*4]
  4121. XORPS XMM2, XMM2
  4122. XORPS XMM3, XMM3
  4123. MOV RAX, 0 ;
  4124. MOV AL, [RBP+add] ;
  4125. CMP AL, 0 ; add?
  4126. JE DotProductLoop ;
  4127. MOV RAX, [RBP+matrixC] ; matrixC
  4128. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  4129. MOVUPS XMM2, [RAX]
  4130. MOVUPS XMM3, [RAX+16]
  4131. MOV RAX, 0
  4132. DotProductLoop:
  4133. MOV RDX, [RSI+RAX*4]
  4134. SHL RDX, 1
  4135. CMP RDX, 0
  4136. JE SparseEntryEscape
  4137. MOVSS XMM0, [RSI+RAX*4]
  4138. SHUFPS XMM0, XMM0, 0H
  4139. MOVUPS XMM1, [RDI]
  4140. MULPS XMM1, XMM0
  4141. ADDPS XMM2, XMM1
  4142. MOVUPS XMM1, [RDI+16]
  4143. MULPS XMM1, XMM0
  4144. ADDPS XMM3, XMM1
  4145. SparseEntryEscape:
  4146. ADD RDI, [RBP+StrideB] ; StrideB
  4147. INC RAX
  4148. CMP RAX, [RBP+Ca] ; Ca
  4149. JL DotProductLoop ; end DotProductLoop
  4150. MOV RAX , [RBP+matrixC] ; matrixC
  4151. MOV RDX, [RBP+CbFrom] ; CbFrom
  4152. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  4153. MOVUPS [RAX], XMM2
  4154. MOVUPS [RAX+16], XMM3
  4155. MOV RAX, [RBP+matrixA] ; matrixA
  4156. ADD RAX, [RBP+StrideA] ; StrideA
  4157. MOV [RBP+matrixA], RAX ; matrixA
  4158. MOV RAX, [RBP+matrixC] ; matrixC
  4159. ADD RAX, [RBP+StrideC] ; StrideC
  4160. MOV [RBP+matrixC], RAX ; matrixC
  4161. INC RCX
  4162. CMP RCX, [RBP+Ra] ; Ra
  4163. JL DotProductSetup ;
  4164. END SSEMul8BlockR;
  4165. PROCEDURE SSEMul4BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4166. add: BOOLEAN );
  4167. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  4168. MOV RCX, 0 ; counter FOR rows IN A-Ra
  4169. DotProductSetup:
  4170. MOV RAX, 0 ; cols IN A
  4171. MOV RSI, [RBP+matrixA] ; matrixA
  4172. MOV RDI, [RBP+matrixB] ; matrixB
  4173. MOV RDX, [RBP+CbFrom] ; CbFrom
  4174. LEA RDI, [RDI+RDX*8]
  4175. XORPS XMM2, XMM2
  4176. XORPS XMM3, XMM3
  4177. MOV RAX, 0 ;
  4178. MOV AL, [RBP+add] ;
  4179. CMP AL, 0 ; add?
  4180. JE DotProductLoop ;
  4181. MOV RAX, [RBP+matrixC] ; matrixC
4182. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  4183. MOVUPD XMM2, [RAX]
  4184. MOVUPD XMM3, [RAX+16]
  4185. MOV RAX, 0
  4186. DotProductLoop:
  4187. ; MOV RDX, [RSI+RAX*8]
  4188. ; SHL RDX, 1
  4189. ; CMP RDX, 0
  4190. ; JE SparseEntryEscape
  4191. MOVSD XMM0, [RSI+RAX*8]
  4192. SHUFPD XMM0, XMM0, 0H
  4193. MOVUPD XMM1, [RDI]
  4194. MULPD XMM1, XMM0
  4195. ADDPD XMM2, XMM1
  4196. MOVUPD XMM1, [RDI+16]
  4197. MULPD XMM1, XMM0
  4198. ADDPD XMM3, XMM1
  4199. SparseEntryEscape:
  4200. ADD RDI, [RBP+StrideB] ; StrideB
  4201. INC RAX
  4202. CMP RAX, [RBP+Ca] ; Ca
  4203. JL DotProductLoop ; end DotProductLoop
  4204. MOV RAX , [RBP+matrixC] ; matrixC
  4205. MOV RDX, [RBP+CbFrom] ; CbFrom
4206. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  4207. MOVUPD [RAX], XMM2
  4208. MOVUPD [RAX+16], XMM3
  4209. MOV RAX, [RBP+matrixA] ; matrixA
  4210. ADD RAX, [RBP+StrideA] ; StrideA
  4211. MOV [RBP+matrixA], RAX ; matrixA
  4212. MOV RAX, [RBP+matrixC] ; matrixC
  4213. ADD RAX, [RBP+StrideC] ; StrideC
  4214. MOV [RBP+matrixC], RAX ; matrixC
  4215. INC RCX
  4216. CMP RCX, [RBP+Ra] ; Ra
  4217. JL DotProductSetup ;
  4218. END SSEMul4BlockX;
  4219. PROCEDURE SSEMul4BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4220. add: BOOLEAN );
  4221. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  4222. MOV RCX, 0 ; counter FOR rows IN A-Ra
  4223. DotProductSetup:
  4224. MOV RAX, 0 ; cols IN A
  4225. MOV RSI, [RBP+matrixA] ; matrixA
  4226. MOV RDI, [RBP+matrixB] ; matrixB
  4227. MOV RDX, [RBP+CbFrom] ; CbFrom
  4228. LEA RDI, [RDI+RDX*4]
  4229. XORPS XMM2, XMM2
  4230. MOV RAX, 0 ;
  4231. MOV AL, [RBP+add] ;
  4232. CMP AL, 0 ; add?
  4233. JE DotProductLoop ;
  4234. MOV RAX, [RBP+matrixC] ; matrixC
  4235. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  4236. MOVUPS XMM2, [RAX]
  4237. MOV RAX, 0
  4238. DotProductLoop:
  4239. MOV RDX, [RSI+RAX*4]
  4240. SHL RDX, 1
  4241. CMP RDX, 0
  4242. JE SparseEntryEscape
  4243. MOVSS XMM0, [RSI+RAX*4]
  4244. SHUFPS XMM0, XMM0, 0H
  4245. MOVUPS XMM1, [RDI]
  4246. MULPS XMM1, XMM0
  4247. ADDPS XMM2, XMM1
  4248. SparseEntryEscape:
  4249. ADD RDI, [RBP+StrideB] ; StrideB
  4250. INC RAX
  4251. CMP RAX, [RBP+Ca] ; Ca
4252. JL DotProductLoop ; end DotProductLoop
  4253. MOV RAX, [RBP+matrixC] ; matrixC
  4254. MOV RDX, [RBP+CbFrom] ; CbFrom
  4255. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  4256. MOVUPS [RAX], XMM2
  4257. MOV RAX, [RBP+matrixA] ; matrixA
  4258. ADD RAX, [RBP+StrideA] ; StrideA
  4259. MOV [RBP+matrixA], RAX ; matrixA
  4260. MOV RAX, [RBP+matrixC] ; matrixC
  4261. ADD RAX, [RBP+StrideC] ; StrideC
  4262. MOV [RBP+matrixC], RAX ; matrixC
  4263. INC RCX
  4264. CMP RCX, [RBP+Ra] ; Ra
  4265. JL DotProductSetup ;
  4266. END SSEMul4BlockR;
  4267. PROCEDURE SSEMul2BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4268. add: BOOLEAN );
  4269. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  4270. MOV RCX, 0 ; counter FOR rows IN A-Ra
  4271. DotProductSetup:
  4272. MOV RAX, 0 ; cols IN A
  4273. MOV RSI, [RBP+matrixA] ; matrixA
  4274. MOV RDI, [RBP+matrixB] ; matrixB
  4275. MOV RDX, [RBP+CbFrom] ; CbFrom
  4276. LEA RDI, [RDI+RDX*8]
  4277. XORPD XMM2, XMM2
  4278. MOV RAX, 0 ;
  4279. MOV AL, [RBP+add] ;
  4280. CMP AL, 0 ; add?
  4281. JE DotProductLoop ;
  4282. MOV RAX, [RBP+matrixC] ; matrixC
4283. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  4284. MOVUPD XMM2, [RAX]
  4285. MOV RAX, 0
  4286. DotProductLoop:
  4287. ; MOV RDX, [RSI+RAX*4] ;
  4288. ; SHL RDX, 1 ;
  4289. ; CMP RDX, 0
  4290. ; JE SparseEntryEscape
  4291. MOVSD XMM0, [RSI+RAX*8]
  4292. SHUFPD XMM0, XMM0, 0H
  4293. MOVUPD XMM1, [RDI]
  4294. MULPD XMM1, XMM0
  4295. ADDPD XMM2, XMM1
  4296. SparseEntryEscape:
  4297. ADD RDI, [RBP+StrideB] ; StrideB
  4298. INC RAX
  4299. CMP RAX, [RBP+Ca] ; Ca
  4300. JL DotProductLoop ; end DotProductLoop
  4301. MOV RAX , [RBP+matrixC] ; matrixC
  4302. MOV RDX, [RBP+CbFrom] ; CbFrom
4303. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  4304. MOVUPD [RAX], XMM2
  4305. MOV RAX, [RBP+matrixA] ; matrixA
  4306. ADD RAX, [RBP+StrideA] ; StrideA
  4307. MOV [RBP+matrixA], RAX ; matrixA
  4308. MOV RAX, [RBP+matrixC] ; matrixC
  4309. ADD RAX, [RBP+StrideC] ; StrideC
  4310. MOV [RBP+matrixC], RAX ; matrixC
  4311. INC RCX
  4312. CMP RCX, [RBP+Ra] ; Ra
  4313. JL DotProductSetup ;
  4314. END SSEMul2BlockX;
  4315. (****** blocking matrix multiplication with copy of data ******)
  4316. PROCEDURE MagicBlockR( M, N, K: SIZE;
  4317. VAR L2BlockM, L2BlockN, L2BlockK: SIZE );
  4318. BEGIN
  4319. K := (K DIV L0BlockKR) * L0BlockKR;
  4320. N := (N DIV L1BlockN) * L1BlockN;
  4321. IF M = 0 THEN M := 1 END;
  4322. IF N = 0 THEN N := 1 END;
  4323. IF K = 0 THEN K := 1 END;
  4324. L2BlockK :=
  4325. K DIV ((K + L1MaxBlockKR - 1) DIV L1MaxBlockKR);
  4326. (* Round up to next multiple of 16 *)
  4327. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  4328. L2BlockN :=
  4329. L2BlockSize DIV SIZEOF( REAL ) DIV
  4330. (L2BlockK * (L2BARatio + 1));
  4331. IF L2BlockN > N THEN L2BlockN := N
  4332. ELSIF L2BlockN < 1 THEN L2BlockN := 1;
  4333. END;
  4334. L2BlockM :=
  4335. (L2BlockSize DIV SIZEOF( REAL ) - L2BlockN * L2BlockK) DIV
  4336. L2BlockK;
  4337. (* Round up to next multiple of 5 *)
  4338. IF L2BlockM > M THEN L2BlockM := M
  4339. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  4340. END;
  4341. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  4342. END MagicBlockR;
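(* Illustrative example with assumed values (the module's actual constants may differ, and M and N are
assumed large enough that the clamps do not apply): suppose K = 512 and L1MaxBlockKR = 256; then
L2BlockK = 512 DIV ((512 + 255) DIV 256) = 256, already a multiple of 16. With an assumed
L2BlockSize = 262144 and L2BARatio = 1 this gives L2BlockN = 262144 DIV 4 DIV (256 * 2) = 128 and
L2BlockM = (262144 DIV 4 - 128 * 256) DIV 256 = 128, before the final rounding of L2BlockN up to a
multiple of L1BlockN. *)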
  4343. PROCEDURE MagicBlockX( M, N, K: SIZE;
  4344. VAR L2BlockM, L2BlockN, L2BlockK:SIZE );
  4345. BEGIN
  4346. K := (K DIV L0BlockKX) * L0BlockKX;
  4347. N := (N DIV L1BlockN) * L1BlockN;
  4348. IF M = 0 THEN M := 1 END;
  4349. IF N = 0 THEN N := 1 END;
  4350. IF K = 0 THEN K := 1 END;
  4351. L2BlockK :=
  4352. K DIV ((K + L1MaxBlockKX - 1) DIV L1MaxBlockKX);
  4353. (* Round up to next multiple of 16 *)
  4354. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  4355. L2BlockN :=
  4356. L2BlockSize DIV SIZEOF( LONGREAL ) DIV
  4357. (L2BlockK * (L2BARatio + 1));
  4358. IF L2BlockN > N THEN L2BlockN := N END;
  4359. L2BlockM :=
  4360. (L2BlockSize DIV SIZEOF( LONGREAL ) - L2BlockN * L2BlockK) DIV
  4361. L2BlockK;
  4362. (* Round up to next multiple of 5 *)
  4363. IF L2BlockM > M THEN L2BlockM := M
  4364. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  4365. END;
  4366. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  4367. END MagicBlockX;
  4368. (*
  4369. PROCEDURE L1Block1X( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4370. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  4371. PROCEDURE null( i: LONGINT );
  4372. BEGIN
  4373. reg[i, 0] := 0; reg[i, 1] := 0;
  4374. END null;
  4375. PROCEDURE get1( adr, i: LONGINT );
  4376. BEGIN
  4377. SYSTEM.GET( adr, reg[i, 0] );
  4378. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4379. END get1;
  4380. PROCEDURE get2( adr, i: LONGINT );
  4381. BEGIN
  4382. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  4383. IF debug THEN
  4384. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4385. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  4386. END;
  4387. END get2;
  4388. PROCEDURE mul2( i, j: LONGINT );
  4389. BEGIN
  4390. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4391. END mul2;
  4392. PROCEDURE add2( i, j: LONGINT );
  4393. BEGIN
  4394. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4395. END add2;
  4396. PROCEDURE put1( adr, i: LONGINT );
  4397. BEGIN
  4398. SYSTEM.PUT( adr, reg[i, 0] );
  4399. END put1;
  4400. PROCEDURE horadd( i: LONGINT );
  4401. BEGIN
  4402. reg[i, 0] := reg[i, 0] + reg[i, 1];
  4403. END horadd;
  4404. BEGIN
  4405. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  4406. null( 2 ); get1( adrC, 2 );
  4407. WHILE (K > 0) DO (* padding guaranteed *)
  4408. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 ); INC( adrB, 16 );
  4409. INC( adrA, 16 ); DEC( K, 2 );
  4410. END;
  4411. horadd( 2 ); put1( adrC, 2 );
  4412. END L1Block1X;
  4413. PROCEDURE L1Block5X( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4414. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  4415. PROCEDURE null( i: LONGINT );
  4416. BEGIN
  4417. reg[i, 0] := 0; reg[i, 1] := 0;
  4418. END null;
  4419. PROCEDURE get1( adr, i: LONGINT );
  4420. BEGIN
  4421. SYSTEM.GET( adr, reg[i, 0] );
  4422. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4423. END get1;
  4424. PROCEDURE get2( adr, i: LONGINT );
  4425. BEGIN
  4426. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  4427. IF debug THEN
  4428. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4429. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  4430. END;
  4431. END get2;
  4432. PROCEDURE mul2( i, j: LONGINT );
  4433. BEGIN
  4434. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4435. END mul2;
  4436. PROCEDURE add2( i, j: LONGINT );
  4437. BEGIN
  4438. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4439. END add2;
  4440. PROCEDURE put1( adr, i: LONGINT );
  4441. BEGIN
  4442. SYSTEM.PUT( adr, reg[i, 0] );
  4443. END put1;
  4444. PROCEDURE horadd( i: LONGINT );
  4445. BEGIN
  4446. reg[i, 0] := reg[i, 0] + reg[i, 1];
  4447. END horadd;
  4448. BEGIN
  4449. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  4450. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  4451. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  4452. get1( adrC + 4 * IncC, 6 );
  4453. WHILE (K > 0) DO (* padding guaranteed *)
  4454. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 );
  4455. get2( adrB + 16, 0 ); mul2( 0, 7 ); add2( 3, 0 ); get2( adrB + 32, 0 );
  4456. mul2( 0, 7 ); add2( 4, 0 ); get2( adrB + 48, 0 ); mul2( 0, 7 );
  4457. add2( 5, 0 ); get2( adrB + 64, 0 ); mul2( 0, 7 ); add2( 6, 0 ); INC( adrB, 80 );
  4458. INC( adrA, 16 ); DEC( K, 2 );
  4459. END;
  4460. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  4461. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  4462. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  4463. END L1Block5X;
  4464. PROCEDURE L1Block1R( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4465. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  4466. PROCEDURE null( i: LONGINT );
  4467. BEGIN
  4468. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  4469. END null;
  4470. PROCEDURE get1( adr, i: LONGINT );
  4471. BEGIN
  4472. SYSTEM.GET( adr, reg[i, 0] );
  4473. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4474. END get1;
  4475. PROCEDURE get4( adr, i: LONGINT );
  4476. BEGIN
  4477. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  4478. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  4479. IF debug THEN
  4480. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4481. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  4482. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  4483. END;
  4484. END get4;
  4485. PROCEDURE mul4( i, j: LONGINT );
  4486. BEGIN
  4487. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4488. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  4489. END mul4;
  4490. PROCEDURE add4( i, j: LONGINT );
  4491. BEGIN
  4492. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4493. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  4494. END add4;
  4495. PROCEDURE put1( adr, i: LONGINT );
  4496. BEGIN
  4497. SYSTEM.PUT( adr, reg[i, 0] );
  4498. END put1;
  4499. PROCEDURE horadd( i: LONGINT );
  4500. BEGIN
  4501. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  4502. END horadd;
  4503. BEGIN
  4504. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  4505. null( 2 ); get1( adrC, 2 );
  4506. WHILE (K > 0) DO (* padding guaranteed *)
  4507. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 ); INC( adrB, 16 );
  4508. INC( adrA, 16 ); DEC( K, 4 );
  4509. END;
  4510. horadd( 2 ); put1( adrC, 2 );
  4511. END L1Block1R;
  4512. PROCEDURE L1Block5R( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4513. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  4514. PROCEDURE null( i: LONGINT );
  4515. BEGIN
  4516. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  4517. END null;
  4518. PROCEDURE get1( adr, i: LONGINT );
  4519. BEGIN
  4520. SYSTEM.GET( adr, reg[i, 0] );
  4521. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4522. END get1;
  4523. PROCEDURE get4( adr, i: LONGINT );
  4524. BEGIN
  4525. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  4526. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  4527. IF debug THEN
  4528. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4529. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  4530. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  4531. END;
  4532. END get4;
  4533. PROCEDURE mul4( i, j: LONGINT );
  4534. BEGIN
  4535. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4536. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  4537. END mul4;
  4538. PROCEDURE add4( i, j: LONGINT );
  4539. BEGIN
  4540. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4541. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  4542. END add4;
  4543. PROCEDURE put1( adr, i: LONGINT );
  4544. BEGIN
  4545. SYSTEM.PUT( adr, reg[i, 0] );
  4546. END put1;
  4547. PROCEDURE horadd( i: LONGINT );
  4548. BEGIN
  4549. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  4550. END horadd;
  4551. BEGIN
  4552. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  4553. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  4554. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  4555. get1( adrC + 4 * IncC, 6 );
  4556. WHILE (K > 0) DO (* padding guaranteed *)
  4557. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 );
  4558. get4( adrB + 16, 0 ); mul4( 0, 7 ); add4( 3, 0 ); get4( adrB + 32, 0 );
  4559. mul4( 0, 7 ); add4( 4, 0 ); get4( adrB + 48, 0 ); mul4( 0, 7 );
  4560. add4( 5, 0 ); get4( adrB + 64, 0 ); mul4( 0, 7 ); add4( 6, 0 ); INC( adrB, 80 );
  4561. INC( adrA, 16 ); DEC( K, 4 );
  4562. END;
  4563. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  4564. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  4565. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  4566. END L1Block5R;
  4567. *)
  4568. PROCEDURE DispCR( adrM: ADDRESS;
  4569. inc, stride, M, N: SIZE );
  4570. VAR i, j: SIZE; adr: ADDRESS; val: REAL;
  4571. BEGIN
  4572. FOR i := 0 TO M - 1 DO
  4573. adr := adrM + i * stride;
  4574. FOR j := 0 TO N - 1 DO
  4575. SYSTEM.GET( adr, val );
  4576. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  4577. END;
  4578. KernelLog.Ln;
  4579. END;
  4580. END DispCR;
  4581. PROCEDURE DispCX( adrM: ADDRESS;
  4582. inc, stride, M, N: SIZE );
  4583. VAR i, j: SIZE; adr: ADDRESS; val: LONGREAL;
  4584. BEGIN
  4585. FOR i := 0 TO M - 1 DO
  4586. adr := adrM + i * stride;
  4587. FOR j := 0 TO N - 1 DO
  4588. SYSTEM.GET( adr, val );
  4589. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  4590. END;
  4591. KernelLog.Ln;
  4592. END;
  4593. END DispCX;
  4594. PROCEDURE L3BlockX( matrixA, matrixB, matrixC: ADDRESS;
  4595. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
4596. (*
4597.          K                   N                  N
4598.    M [    A    ]   *   K [    B    ]   ->   M [    C    ]
4599.
4600.    A is an M x K matrix, B is a K x N matrix and C is the M x N
4601.    result; the loops below work on L2BlockM x L2BlockK tiles of A
4602.    and L2BlockK x L2BlockN tiles of B so that the working set of
4603.    each tile pair stays cache resident. *)
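(* sketch of the blocking performed below, assuming the block sizes divide M, N and K
   (the WHILE loops additionally handle the remainders):

      FOR each stripe of L2BlockN columns of C DO
         FOR each stripe of L2BlockM rows of C DO
            FOR each slice of L2BlockK values of K DO
               L2Block( tile of A, tile of B, tile of C )   - accumulates into C
            END
         END
      END
*)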
  4604. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4605. KAligned: SIZE;
  4606. CONST Size = SIZEOF( LONGREAL );
  4607. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4608. (* M,N and K arbitrary ! *)
  4609. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4610. m, k, KAligned: SIZE;
  4611. BEGIN
  4612. KAligned := Align2( K ) * 8;
  4613. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4614. END;
  4615. adrB := matrixB;
  4616. WHILE (N >= L1BlockN) DO
  4617. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4618. adrC := matrixC; adrA := matrixA; m := M;
  4619. WHILE (m > 0) DO
  4620. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4621. IF SSE THEN
  4622. L1Block5XSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4623. ELSE
  4624. aadrA := adrA; aadrB := adrB; k := K;
  4625. WHILE (k > 0) DO
  4626. L1Block1XA( aadrA, aadrB, adrC, 2 );
  4627. L1Block1XA( aadrA, aadrB + 16, adrC + incC, 2 );
  4628. L1Block1XA( aadrA, aadrB + 32, adrC + 2 * incC,
  4629. 2 );
  4630. L1Block1XA( aadrA, aadrB + 48, adrC + 3 * incC,
  4631. 2 );
  4632. L1Block1XA( aadrA, aadrB + 64, adrC + 4 * incC,
  4633. 2 );
  4634. DEC( k, 2 ); INC( aadrA, 16 );
  4635. INC( aadrB, 16 * L1BlockN );
  4636. END;
  4637. END;
  4638. IF debug THEN
  4639. DispCX( matrixC, incC, strideC, M, N );
  4640. END;
  4641. INC( adrA, KAligned ); INC( adrC, strideC );
  4642. DEC( m );
  4643. END;
  4644. INC( matrixC, L1BlockN * incC );
  4645. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4646. END;
  4647. WHILE (N > 0) DO
  4648. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4649. adrC := matrixC; adrA := matrixA; m := M;
  4650. WHILE (m > 0) DO
  4651. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4652. IF SSE THEN
  4653. L1Block1XSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4654. ELSE L1Block1XA( adrA, adrB, adrC, K );
  4655. END;
  4656. IF debug THEN
  4657. DispCX( matrixC, incC, strideC, M, N );
  4658. END;
  4659. INC( adrA, KAligned ); INC( adrC, strideC );
  4660. DEC( m );
  4661. END;
  4662. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4663. END;
  4664. END L2Block;
  4665. BEGIN
  4666. KAligned := Align2( K ) * 8;
  4667. ASSERT( L2BlockK MOD 2 = 0 );
  4668. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4669. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4670. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4671. WHILE (n >= L2BlockN) DO
  4672. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4673. a1 := matrixA; adrC := matrixC; m := M;
  4674. WHILE (m >= L2BlockM) DO
  4675. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4676. adrA := a1; adrB := b1; k := K;
  4677. (* core: do matching level 2 cache Blocks *)
  4678. WHILE (k >= L2BlockK) DO
  4679. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4680. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4681. L2BlockK );
  4682. INC( adrA, L2BlockK * L2BlockM * Size );
  4683. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4684. DEC( k, L2BlockK );
  4685. END;
  4686. (* core: do rest of k *)
  4687. IF k > 0 THEN
  4688. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4689. END;
  4690. INC( a1, KAligned * L2BlockM );
  4691. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4692. END;
  4693. IF m > 0 THEN
  4694. (* clean up M *)
  4695. adrA := a1; adrB := b1; k := K;
  4696. WHILE (k >= L2BlockK) DO
  4697. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4698. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4699. INC( adrA, L2BlockK * Size * m );
  4700. INC( adrB, L2BlockK * L2BlockN * Size );
  4701. DEC( k, L2BlockK );
  4702. END;
  4703. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4704. IF k > 0 THEN
  4705. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4706. END;
  4707. END;
  4708. INC( b1, L2BlockN * KAligned );
  4709. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4710. END;
  4711. IF (n = 0) THEN RETURN
  4712. END;
  4713. a1 := matrixA; adrC := matrixC; m := M;
  4714. WHILE (m >= L2BlockM) DO
  4715. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4716. adrA := a1; adrB := b1; k := K;
  4717. WHILE (k >= L2BlockK) DO
  4718. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4719. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4720. INC( adrA, L2BlockM * L2BlockK * Size );
  4721. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4722. END;
  4723. IF k > 0 THEN
  4724. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4725. END;
  4726. INC( a1, L2BlockM * KAligned );
  4727. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4728. END;
  4729. IF (m = 0) THEN RETURN
  4730. END;
  4731. adrA := a1; adrB := b1; k := K;
  4732. WHILE (k >= L2BlockK) DO
  4733. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4734. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4735. INC( adrA, L2BlockK * m * Size );
  4736. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4737. END;
  4738. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4739. END;
  4740. END L3BlockX;
  4741. PROCEDURE L3BlockR( matrixA, matrixB, matrixC: ADDRESS;
  4742. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
4743. (*
4744.          K                   N                  N
4745.    M [    A    ]   *   K [    B    ]   ->   M [    C    ]
4746.
4747.    A is an M x K matrix, B is a K x N matrix and C is the M x N
4748.    result; as in L3BlockX, the loops below work on L2-cache sized
4749.    tiles (L2BlockM x L2BlockK of A, L2BlockK x L2BlockN of B) and
4750.    defer the innermost work to the L1 kernels. *)
  4751. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4752. KAligned: SIZE;
  4753. CONST Size = SIZEOF( REAL );
  4754. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4755. (* M,N and K arbitrary ! *)
  4756. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4757. m, KAligned, k: SIZE;
  4758. BEGIN
  4759. KAligned := Align4( K ) * 4;
  4760. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4761. END;
  4762. adrB := matrixB;
  4763. WHILE (N >= L1BlockN) DO
  4764. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4765. adrC := matrixC; adrA := matrixA; m := M;
  4766. WHILE (m > 0) DO
  4767. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4768. IF SSE THEN
  4769. L1Block5RSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4770. ELSE
  4771. aadrA := adrA; aadrB := adrB; k := K;
  4772. WHILE (k > 0) DO
  4773. L1Block1RA( aadrA, aadrB, adrC, 4 );
  4774. L1Block1RA( aadrA, aadrB + 16, adrC + incC, 4 );
  4775. L1Block1RA( aadrA, aadrB + 32, adrC + 2 * incC,
  4776. 4 );
  4777. L1Block1RA( aadrA, aadrB + 48, adrC + 3 * incC,
  4778. 4 );
  4779. L1Block1RA( aadrA, aadrB + 64, adrC + 4 * incC,
  4780. 4 );
  4781. DEC( k, 4 ); INC( aadrA, 16 );
  4782. INC( aadrB, 16 * L1BlockN );
  4783. END;
  4784. END;
  4785. IF debug THEN
  4786. DispCR( matrixC, incC, strideC, M, N );
  4787. END;
  4788. INC( adrA, KAligned ); INC( adrC, strideC );
  4789. DEC( m );
  4790. END;
  4791. INC( matrixC, L1BlockN * incC );
  4792. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4793. END;
  4794. WHILE (N > 0) DO
  4795. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4796. adrC := matrixC; adrA := matrixA; m := M;
  4797. WHILE (m > 0) DO
  4798. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4799. IF SSE THEN
  4800. L1Block1RSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4801. ELSE L1Block1RA( adrA, adrB, adrC, K );
  4802. END;
  4803. IF debug THEN
  4804. DispCR( matrixC, incC, strideC, M, N );
  4805. END;
  4806. INC( adrA, KAligned ); INC( adrC, strideC );
  4807. DEC( m );
  4808. END;
  4809. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4810. END;
  4811. END L2Block;
  4812. BEGIN
  4813. KAligned := Align4( K ) * 4;
  4814. ASSERT( L2BlockK MOD 4 = 0 );
  4815. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4816. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4817. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4818. WHILE (n >= L2BlockN) DO
  4819. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4820. a1 := matrixA; adrC := matrixC; m := M;
  4821. WHILE (m >= L2BlockM) DO
  4822. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4823. adrA := a1; adrB := b1; k := K;
  4824. (* core: do matching level 2 cache Blocks *)
  4825. WHILE (k >= L2BlockK) DO
  4826. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4827. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4828. L2BlockK );
  4829. INC( adrA, L2BlockK * L2BlockM * Size );
  4830. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4831. DEC( k, L2BlockK );
  4832. END;
  4833. (* core: do rest of k *)
  4834. IF k > 0 THEN
  4835. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4836. END;
  4837. INC( a1, KAligned * L2BlockM );
  4838. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4839. END;
  4840. IF m > 0 THEN
  4841. (* clean up M *)
  4842. adrA := a1; adrB := b1; k := K;
  4843. WHILE (k >= L2BlockK) DO
  4844. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4845. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4846. INC( adrA, L2BlockK * Size * m );
  4847. INC( adrB, L2BlockK * L2BlockN * Size );
  4848. DEC( k, L2BlockK );
  4849. END;
  4850. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4851. IF k > 0 THEN
  4852. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4853. END;
  4854. END;
  4855. INC( b1, L2BlockN * KAligned );
  4856. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4857. END;
  4858. IF (n = 0) THEN RETURN
  4859. END;
  4860. a1 := matrixA; adrC := matrixC; m := M;
  4861. WHILE (m >= L2BlockM) DO
  4862. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4863. adrA := a1; adrB := b1; k := K;
  4864. WHILE (k >= L2BlockK) DO
  4865. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4866. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4867. INC( adrA, L2BlockM * L2BlockK * Size );
  4868. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4869. END;
  4870. IF k > 0 THEN
  4871. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4872. END;
  4873. INC( a1, L2BlockM * KAligned );
  4874. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4875. END;
  4876. IF (m = 0) THEN RETURN
  4877. END;
  4878. adrA := a1; adrB := b1; k := K;
  4879. WHILE (k >= L2BlockK) DO
  4880. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4881. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4882. INC( adrA, L2BlockK * m * Size );
  4883. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4884. END;
  4885. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4886. END;
  4887. END L3BlockR;
  4888. PROCEDURE Align( adr: ADDRESS; align: SIZE ): ADDRESS;
  4889. BEGIN
4890. RETURN adr + (-adr) MOD align; (* round adr up to the next multiple of align, typically 16 bytes = 128 bit *)
  4891. END Align;
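(* examples: Align( 13, 16 ) = 16, Align( 32, 16 ) = 32; used below both for
   16 byte (128 bit) alignment of buffers and strides and for rounding a work
   size up to a multiple of a block size *)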
  4892. PROCEDURE CopyAX( matrixA, dest: ADDRESS;
  4893. IncA, StrideA: SIZE;
  4894. K, M, L2BlockK, L2BlockM: SIZE );
  4895. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4896. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4897. VAR rest: SIZE;
  4898. BEGIN
  4899. IF debug THEN
  4900. KernelLog.String( "CopyMK:" ); KernelLog.Int( M, 10 ); KernelLog.Int( K, 10 );
  4901. KernelLog.Ln;
  4902. END;
  4903. rest := (-K) MOD 2;
  4904. WHILE (M > 0) DO
  4905. MovX( matrixA, dest, IncA, K ); INC( dest, K * 8 );
  4906. IF rest # 0 THEN
  4907. ZeroX( dest, rest ); INC( dest, 8 * rest );
  4908. END;
  4909. INC( matrixA, StrideA ); DEC( M );
  4910. END;
  4911. END CopyMK;
  4912. BEGIN
  4913. Tic( t ); m := M;
  4914. WHILE (m >= L2BlockM) DO
  4915. k := K; adrA := matrixA;
  4916. WHILE (k >= L2BlockK) DO
  4917. CopyMK( adrA, L2BlockM, L2BlockK );
  4918. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4919. END;
  4920. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4921. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4922. END;
  4923. adrA := matrixA; k := K;
  4924. WHILE (k >= L2BlockK) DO
  4925. CopyMK( adrA, m, L2BlockK );
  4926. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4927. END;
  4928. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4929. Toc( t, copyT );
  4930. END CopyAX;
  4931. PROCEDURE CopyAR( matrixA, dest: ADDRESS;
  4932. IncA, StrideA: SIZE;
  4933. K, M, L2BlockK, L2BlockM: SIZE );
  4934. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4935. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4936. VAR rest: SIZE;
  4937. BEGIN
  4938. rest := (-K) MOD 4;
  4939. WHILE (M > 0) DO
  4940. MovR( matrixA, dest, IncA, K ); INC( dest, K * 4 );
  4941. IF rest # 0 THEN
  4942. ZeroR( dest, rest ); INC( dest, 4 * rest );
  4943. END;
  4944. INC( matrixA, StrideA ); DEC( M );
  4945. END;
  4946. END CopyMK;
  4947. BEGIN
  4948. Tic( t ); m := M;
  4949. WHILE (m >= L2BlockM) DO
  4950. k := K; adrA := matrixA;
  4951. WHILE (k >= L2BlockK) DO
  4952. CopyMK( adrA, L2BlockM, L2BlockK );
  4953. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4954. END;
  4955. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4956. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4957. END;
  4958. adrA := matrixA; k := K;
  4959. WHILE (k >= L2BlockK) DO
  4960. CopyMK( adrA, m, L2BlockK );
  4961. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4962. END;
  4963. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4964. Toc( t, copyT );
  4965. END CopyAR;
  4966. PROCEDURE CopyBX( matrixB, dest: ADDRESS;
  4967. IncB, StrideB: SIZE;
  4968. N, K, L2BlockN, L2BlockK: SIZE );
  4969. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4970. PROCEDURE Copy5x2k( matrixB: ADDRESS; k: SIZE );
  4971. VAR i: SIZE; adrB: ADDRESS; rest: SIZE;
  4972. BEGIN
  4973. rest := (-k) MOD 2;
4974. WHILE (k >= 2) DO (* store 5x2 Block in line *)
  4975. adrB := matrixB;
  4976. FOR i := 1 TO L1BlockN DO
  4977. MovX( adrB, dest, StrideB, 2 ); INC( dest, 16 );
  4978. INC( adrB, IncB );
  4979. END;
  4980. INC( matrixB, 2 * StrideB ); DEC( k, 2 );
  4981. END;
  4982. IF k > 0 THEN
  4983. adrB := matrixB;
  4984. FOR i := 1 TO L1BlockN DO
  4985. MovX( adrB, dest, StrideB, k ); INC( dest, 8 * k );
  4986. IF rest # 0 THEN
  4987. ZeroX( dest, rest ); INC( dest, rest * 8 );
  4988. END;
  4989. INC( adrB, IncB );
  4990. END;
  4991. END;
  4992. END Copy5x2k;
  4993. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4994. VAR n, rest: SIZE;
  4995. BEGIN
  4996. rest := (-K) MOD 2;
  4997. IF debug THEN
  4998. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4999. END;
  5000. n := N;
  5001. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  5002. Copy5x2k( matrixB, K );
  5003. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  5004. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  5005. END;
  5006. IF debug THEN KernelLog.String( "Copy1, n=" ); KernelLog.Int( n, 10 ); KernelLog.Ln;
  5007. END;
  5008. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  5009. MovX( matrixB, dest, StrideB, K ); INC( dest, K * 8 );
5010. ZeroX( dest, rest ); INC( dest, rest * 8 ); (* pad with LONGREAL zeros, cf. Copy5x2k *)
  5011. INC( matrixB, IncB ); DEC( n );
  5012. END;
  5013. END Copy1;
  5014. BEGIN
  5015. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  5016. ASSERT( L2BlockK MOD 2 = 0 ); n := N;
  5017. WHILE (n >= L2BlockN) DO
  5018. k := K; adrB := matrixB;
  5019. WHILE (k >= L2BlockK) DO
  5020. Copy1( adrB, L2BlockK, L2BlockN );
  5021. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  5022. END;
  5023. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  5024. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  5025. END;
  5026. IF (n = 0) THEN RETURN
  5027. END;
  5028. k := K; adrB := matrixB;
  5029. WHILE (k >= L2BlockK) DO
  5030. Copy1( adrB, L2BlockK, n );
  5031. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  5032. END;
  5033. Copy1( adrB, k, n ); Toc( t, copyT );
  5034. END CopyBX;
  5035. PROCEDURE CopyBR( matrixB, dest: ADDRESS;
  5036. IncB, StrideB: SIZE;
  5037. N, K, L2BlockN, L2BlockK: SIZE );
  5038. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  5039. PROCEDURE Copy5x4k( matrixB: ADDRESS; k: SIZE );
  5040. VAR i: SIZE; adrB: ADDRESS; rest, k4: SIZE;
  5041. BEGIN
  5042. k4 := k - k MOD 4; rest := (-k) MOD 4;
  5043. IF k4 > 0 THEN
  5044. MovR5( matrixB, IncB, StrideB, dest, k4 );
  5045. INC( matrixB, k4 * StrideB ); INC( dest, k4 * 80 DIV 4 );
  5046. DEC( k, k4 );
  5047. END;
  5048. (*
  5049. WHILE (k >= 4) DO (* store 5x4 Block in line *)
  5050. adrB := matrixB;
  5051. FOR i := 1 TO L1BlockN DO
  5052. MovR( adrB, dest, StrideB, 4 ); INC( dest, 16 ); INC( adrB, IncB );
  5053. END;
  5054. INC( matrixB, 4 * StrideB ); DEC( k, 4 );
  5055. END;
  5056. *)
  5057. IF k > 0 THEN
  5058. adrB := matrixB;
  5059. FOR i := 1 TO L1BlockN DO
  5060. MovR( adrB, dest, StrideB, k ); INC( dest, 4 * k );
  5061. IF rest # 0 THEN
  5062. ZeroR( dest, rest ); INC( dest, rest * 4 );
  5063. END;
  5064. INC( adrB, IncB );
  5065. END;
  5066. END;
  5067. END Copy5x4k;
  5068. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  5069. VAR n, rest: SIZE;
  5070. BEGIN
  5071. rest := (-K) MOD 4;
  5072. IF debug THEN
  5073. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  5074. END;
  5075. n := N;
  5076. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  5077. Copy5x4k( matrixB, K );
  5078. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  5079. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  5080. END;
  5081. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  5082. MovR( matrixB, dest, StrideB, K ); INC( dest, K * 4 );
  5083. ZeroR( dest, rest ); INC( dest, rest * 4 );
  5084. INC( matrixB, IncB ); DEC( n );
  5085. END;
  5086. END Copy1;
  5087. BEGIN
  5088. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  5089. ASSERT( L2BlockK MOD 4 = 0 ); n := N;
  5090. WHILE (n >= L2BlockN) DO
  5091. k := K; adrB := matrixB;
  5092. WHILE (k >= L2BlockK) DO
  5093. Copy1( adrB, L2BlockK, L2BlockN );
  5094. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  5095. END;
  5096. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  5097. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  5098. END;
  5099. IF (n = 0) THEN RETURN
  5100. END;
  5101. k := K; adrB := matrixB;
  5102. WHILE (k >= L2BlockK) DO
  5103. Copy1( adrB, L2BlockK, n );
  5104. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  5105. END;
  5106. Copy1( adrB, k, n ); Toc( t, copyT );
  5107. END CopyBR;
  5108. (*
  5109. PROCEDURE FillMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  5110. VAR i, j: LONGINT;
  5111. BEGIN
  5112. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  5113. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  5114. A[i, j] := ran.Dice( 10 );
  5115. IF debug THEN A[i, j] := 10 * i + j; END;
  5116. END;
  5117. END;
  5118. END FillMR;
  5119. PROCEDURE DispMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  5120. VAR i, j: LONGINT;
  5121. BEGIN
  5122. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  5123. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  5124. KernelLog.Ln;
  5125. END;
  5126. END DispMR;
  5127. PROCEDURE FillMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  5128. VAR i, j: LONGINT;
  5129. BEGIN
  5130. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  5131. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  5132. A[i, j] := ran.Dice( 10 );
  5133. IF debug THEN A[i, j] := 10 * i + j; END;
  5134. END;
  5135. END;
  5136. END FillMX;
  5137. PROCEDURE DispMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  5138. VAR i, j: LONGINT;
  5139. BEGIN
  5140. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  5141. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  5142. KernelLog.Ln;
  5143. END;
  5144. END DispMX;
  5145. *)
  5146. PROCEDURE Tic( VAR t: HUGEINT );
  5147. BEGIN
  5148. t := Machine.GetTimer();
  5149. END Tic;
  5150. PROCEDURE Toc( VAR t, addto: HUGEINT );
  5151. BEGIN
  5152. INC( addto, Machine.GetTimer() - t ); t := Machine.GetTimer();
  5153. END Toc;
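(* Tic/Toc bracket a code section and add its duration in timer ticks to one of
   the global counters (copyT, zeroT, compT), e.g.  Tic( t ); ... ; Toc( t, compT )  *)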
  5154. PROCEDURE MultiplyX( A, B, C: ADDRESS;
  5155. M, N, K, L2BlockM, L2BlockN, L2BlockK:SIZE;
  5156. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  5157. add: BOOLEAN );
  5158. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  5159. M1, M2, i: SIZE; val: LONGREAL; t: HUGEINT;
  5160. inc: SIZE;
  5161. obj: POINTER TO ARRAY OF MultiplyObjectX;
  5162. cache: Cache;
  5163. BEGIN
  5164. NEW(obj,nrProcesses+1);
  5165. lenA := M * Align2( K ) * 8; lenB := N * Align2( K ) * 8;
  5166. cache := cachePool.Acquire( lenA + lenB );
  5167. adrA := cache.adr; adrB := adrA + lenA;
  5168. CopyAX( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  5169. CopyBX( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  5170. Tic( t ); m := M; adrC := C;
  5171. IF ~add THEN
  5172. WHILE (m > 0) DO
  5173. ZeroXI( adrC, IncC, N ); INC( adrC, StrideC ); DEC( m );
  5174. END;
  5175. END;
  5176. Toc( t, zeroT );
  5177. IF debug THEN
  5178. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  5179. FOR i := 0 TO M * Align2( K ) - 1 DO
  5180. SYSTEM.GET( adrA + i * 8, val );
  5181. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  5182. END;
  5183. END;
  5184. IF debug THEN
  5185. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  5186. FOR i := 0 TO N * Align2( K ) - 1 DO
  5187. SYSTEM.GET( adrB + i * 8, val );
  5188. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  5189. END;
  5190. END;
  5191. IF parallel & (M > L2BlockM) THEN
  5192. inc := Align( MAX(M DIV nrProcesses,L2BlockM), L2BlockM ); M1 := 0;
  5193. i := 0;
  5194. WHILE (M1 < M) DO
  5195. M2 := M1 + inc;
  5196. IF M2 > M THEN M2 := M END;
  5197. NEW( obj[i], adrA + M1 * Align2( K ) * 8, adrB,
  5198. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  5199. L2BlockM, L2BlockN, L2BlockK );
  5200. M1 := M2; INC( i );
  5201. END;
  5202. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  5203. ELSE
  5204. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  5205. L2BlockN, L2BlockK );
  5206. END;
  5207. Toc( t, compT ); cachePool.Release( cache );
  5208. END MultiplyX;
  5209. PROCEDURE MultiplyR( A, B, C: ADDRESS;
  5210. M, N, K, L2BlockM, L2BlockN, L2BlockK: SIZE;
  5211. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  5212. add: BOOLEAN );
  5213. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  5214. M1, M2, i: SIZE; val: REAL; inc: SIZE;
  5215. obj: POINTER TO ARRAY OF MultiplyObjectR;
  5216. t: HUGEINT; cache: Cache;
  5217. BEGIN
  5218. NEW(obj,nrProcesses+1);
  5219. lenA := M * Align4( K ) * 4; lenB := N * Align4( K ) * 4;
  5220. cache := cachePool.Acquire( lenA + lenB );
  5221. adrA := cache.adr; adrB := adrA + lenA;
  5222. CopyAR( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  5223. CopyBR( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  5224. Tic( t ); m := M; adrC := C;
  5225. IF ~add THEN
  5226. WHILE (m > 0) DO
  5227. ZeroRI( adrC, IncC, N ); INC( adrC, StrideC );
  5228. DEC( m );
  5229. END;
  5230. END;
  5231. Toc( t, zeroT );
  5232. IF debug THEN
  5233. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  5234. FOR i := 0 TO M * Align4( K ) - 1 DO
  5235. SYSTEM.GET( adrA + i * 4, val );
  5236. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  5237. END;
  5238. END;
  5239. IF debug THEN
  5240. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  5241. FOR i := 0 TO N * Align4( K ) - 1 DO
  5242. SYSTEM.GET( adrB + i * 4, val );
  5243. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  5244. END;
  5245. END;
  5246. IF parallel & (M > L2BlockM) THEN
  5247. inc := Align( MAX(M DIV nrProcesses,L2BlockM), L2BlockM ); M1 := 0;
  5248. i := 0;
  5249. WHILE (M1 < M) DO
  5250. M2 := M1 + inc;
  5251. IF M2 > M THEN M2 := M END;
  5252. NEW( obj[i], adrA + M1 * Align4( K ) * 4, adrB,
  5253. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  5254. L2BlockM, L2BlockN, L2BlockK );
  5255. M1 := M2; INC( i );
  5256. END;
  5257. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  5258. ELSE
  5259. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  5260. L2BlockN, L2BlockK );
  5261. END;
  5262. Toc( t, compT ); cachePool.Release( cache );
  5263. END MultiplyR;
  5264. (*
  5265. PROCEDURE DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  5266. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  5267. A, B, C, D: ARRAY [ .. , .. ] OF LONGREAL;
  5268. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  5269. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: LONGREAL; atime, time: LONGINT;
  5270. BEGIN
  5271. KernelLog.String( "LONGREAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  5272. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  5273. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  5274. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  5275. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMX( A ); FillMX( B );
  5276. IF debug THEN DispMX( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMX( B );
  5277. END;
  5278. atime := Input.Time(); (* C := 0; *)
  5279. WHILE (iter > 0) DO
  5280. MultiplyX( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5281. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  5282. (*
  5283. 8,
  5284. LEN( A, 1 ) * 8, 8, LEN( B, 1 ) * 8, 8, LEN( C, 1 ) * 8
  5285. *)
  5286. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5287. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  5288. );
  5289. DEC( iter );
  5290. END;
  5291. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  5292. IF debug THEN
  5293. DispMX( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMX( B ); KernelLog.String( " = " );
  5294. KernelLog.Ln; DispMX( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  5295. END;
  5296. IF check THEN
  5297. (*
  5298. NEW(D,M,N);
  5299. MatMulAXAXNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5300. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5301. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  5302. *)
  5303. D := A * B;
  5304. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  5305. END;
  5306. END DoTestX;
  5307. PROCEDURE DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  5308. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  5309. A, B, C, D: ARRAY [ .. , .. ] OF REAL;
  5310. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  5311. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: REAL; atime, time: LONGINT;
  5312. BEGIN
  5313. KernelLog.String( "REAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  5314. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  5315. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  5316. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  5317. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMR( A ); FillMR( B );
  5318. IF debug THEN DispMR( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMR( B );
  5319. END;
  5320. atime := Input.Time(); (* C := 0; *)
  5321. FOR i := 1 TO iter DO
  5322. MultiplyR( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5323. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  5324. (* 4,
  5325. LEN( A, 1 ) * 4, 4, LEN( B, 1 ) * 4, 4, LEN( C, 1 ) * 4 *)
  5326. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5327. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  5328. );
  5329. END;
  5330. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  5331. IF debug THEN
  5332. DispMR( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMR( B ); KernelLog.String( " = " );
  5333. KernelLog.Ln; DispMR( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  5334. END;
  5335. IF check THEN
  5336. (*
  5337. NEW(D,M,N);
  5338. MatMulARARNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5339. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5340. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  5341. *)
  5342. D := A * B;
  5343. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  5344. END;
  5345. END DoTestR;
  5346. PROCEDURE RandTestR*;
  5347. VAR iter, i, time: LONGINT;
  5348. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  5349. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  5350. BEGIN
  5351. IF Min = Max THEN RETURN Min
  5352. ELSE RETURN ran.Dice( Max - Min ) + Min
  5353. END;
  5354. END Ran;
  5355. BEGIN
  5356. In.Open(); In.LongInt( iter );
  5357. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  5358. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  5359. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  5360. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  5361. K := Ran( MinK, MaxK );
  5362. IF N < 5 THEN N := 5 END;
  5363. IF K < 4 THEN K := 4 END;
  5364. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  5365. BN := Align( BN, 5 );
  5366. IF BN > N THEN DEC( BN, 5 ) END;
  5367. BK := Align( BK, 4 );
  5368. IF BK > K THEN DEC( BK, 4 ) END;
  5369. DoTestR( M, N, K, BM, BN, BK, TRUE , 1 );
  5370. END;
  5371. END RandTestR;
  5372. PROCEDURE RandTestX*;
  5373. VAR iter, i, time: LONGINT;
  5374. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  5375. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  5376. BEGIN
  5377. IF Min = Max THEN RETURN Min
  5378. ELSE RETURN ran.Dice( Max - Min ) + Min
  5379. END;
  5380. END Ran;
  5381. BEGIN
  5382. In.Open(); In.LongInt( iter );
  5383. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  5384. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  5385. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  5386. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  5387. K := Ran( MinK, MaxK );
  5388. IF N < 5 THEN N := 5 END;
  5389. IF K < 4 THEN K := 4 END;
  5390. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  5391. BN := Align( BN, 5 );
  5392. IF BN > N THEN DEC( BN, 5 ) END;
  5393. BK := Align( BK, 4 );
  5394. IF BK > K THEN DEC( BK, 4 ) END;
  5395. DoTestX( M, N, K, BM, BN, BK, TRUE , 1 );
  5396. END;
  5397. END RandTestX;
  5398. *)
  5399. (*
  5400. PROCEDURE Times*;
  5401. VAR all: HUGEINT;
  5402. BEGIN
  5403. all := allocT + copyT + zeroT + compT; KernelLog.String( "alloc=" );
  5404. KernelLog.LongRealFix( allocT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5405. KernelLog.Int( ENTIER( 100 * allocT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5406. KernelLog.Ln; KernelLog.String( "copy=" );
  5407. KernelLog.LongRealFix( copyT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5408. KernelLog.Int( ENTIER( 100 * copyT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5409. KernelLog.Ln; KernelLog.String( "zero=" );
  5410. KernelLog.LongRealFix( zeroT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5411. KernelLog.Int( ENTIER( 100 * zeroT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5412. KernelLog.Ln; KernelLog.String( "comp=" );
  5413. KernelLog.LongRealFix( compT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5414. KernelLog.Int( ENTIER( 100 * compT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5415. KernelLog.Ln;
  5416. END Times;
  5417. *)
  5418. (*
  5419. PROCEDURE TestRMM*;
  5420. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  5421. check, iter: LONGINT;
  5422. BEGIN
  5423. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  5424. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  5425. In.LongInt( iter ); In.LongInt( check );
  5426. IF L2BlockM = 0 THEN
  5427. MagicBlockR( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  5428. END;
  5429. DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  5430. END TestRMM;
  5431. PROCEDURE TestXMM*;
  5432. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  5433. iter, check: LONGINT;
  5434. BEGIN
  5435. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  5436. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  5437. In.LongInt( iter ); In.LongInt( check );
  5438. IF L2BlockM = 0 THEN
  5439. MagicBlockX( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  5440. END;
  5441. DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  5442. END TestXMM;
  5443. *)
  5444. (****** matrix multiplication using fast scalar product ******)
  5445. PROCEDURE MatMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5446. BEGIN
  5447. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  5448. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5449. END MatMulAXAXLoopA;
  5450. PROCEDURE MatMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5451. BEGIN
  5452. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  5453. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5454. END MatMulAXAXLoopSSE;
  5455. PROCEDURE MatMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5456. BEGIN
  5457. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  5458. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5459. END MatMulARARLoopA;
  5460. PROCEDURE MatMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5461. BEGIN
  5462. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  5463. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5464. END MatMulARARLoopSSE;
  5465. PROCEDURE MatMulIncAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5466. BEGIN
  5467. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5468. END MatMulIncAXAXLoopA;
  5469. PROCEDURE MatMulIncAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5470. BEGIN
  5471. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5472. END MatMulIncAXAXLoopSSE;
  5473. PROCEDURE MatMulIncARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5474. BEGIN
  5475. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5476. END MatMulIncARARLoopA;
  5477. PROCEDURE MatMulIncARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5478. BEGIN
  5479. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5480. END MatMulIncARARLoopSSE;
5481. (****** matrix multiplication over rows with transposition of B ******)
  5482. PROCEDURE MatMulHBlockR( MatrixA, MatrixB, MatrixC: ADDRESS;
  5483. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  5484. add: BOOLEAN );
  5485. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  5486. (*
  5487. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  5488. *)
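(* each entry C[i,j] is thus an aligned dot product of row i of A and row j of the
   (pre-transposed) B, computed by AlignedSPRSSE; Block below restricts i and j to
   ranges whose rows of A and B fit roughly into the L2 cache (see BlockSize) *)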
  5489. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  5490. VAR i, j: SIZE; adrA, adrB, adrC: ADDRESS;
  5491. BEGIN
  5492. FOR i := fromA TO toA - 1 DO
  5493. adrA := MatrixA + i * Stride;
  5494. FOR j := fromB TO toB - 1 DO
  5495. adrB := MatrixB + j * Stride;
  5496. adrC := MatrixC + i * StrideC + j * IncC;
  5497. AlignedSPRSSE( adrA, adrB, adrC, Cols, add );
  5498. END;
  5499. END;
  5500. END Block;
  5501. BEGIN
  5502. IF cBlockSize = 0 THEN
  5503. BlockSize := L2CacheSize DIV Stride DIV 4;
  5504. ELSE BlockSize := cBlockSize;
  5505. END;
  5506. lastUsedBlockSize := BlockSize;
  5507. fromA := 0;
  5508. REPEAT
  5509. toA := fromA + BlockSize;
  5510. IF toA > RowsA THEN toA := RowsA END;
  5511. fromB := 0;
  5512. REPEAT
  5513. toB := fromB + BlockSize;
  5514. IF toB > RowsB THEN toB := RowsB END;
  5515. Block( fromA, toA, fromB, toB ); fromB := toB;
  5516. UNTIL toB = RowsB;
  5517. fromA := toA;
  5518. UNTIL toA = RowsA;
  5519. END MatMulHBlockR;
  5520. PROCEDURE MatMulHBlockX( MatrixA, MatrixB, MatrixC: ADDRESS;
5521. (*inc=8*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  5522. add: BOOLEAN );
  5523. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  5524. (*
  5525. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  5526. *)
  5527. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  5528. VAR adrA, adrB, adrC: ADDRESS; i, j: SIZE;
  5529. BEGIN
  5530. FOR i := fromA TO toA - 1 DO
  5531. adrA := MatrixA + i * Stride;
  5532. FOR j := fromB TO toB - 1 DO
  5533. adrB := MatrixB + j * Stride;
  5534. adrC := MatrixC + i * StrideC + j * IncC;
  5535. AlignedSPXSSE( adrA, adrB, adrC, Cols, add );
  5536. END;
  5537. END;
  5538. END Block;
  5539. BEGIN
  5540. IF cBlockSize = 0 THEN
  5541. BlockSize := L2CacheSize DIV Stride DIV 8;
  5542. ELSE BlockSize := cBlockSize;
  5543. END;
  5544. lastUsedBlockSize := BlockSize;
  5545. fromA := 0;
  5546. REPEAT
  5547. toA := fromA + BlockSize;
  5548. IF toA > RowsA THEN toA := RowsA END;
  5549. fromB := 0;
  5550. REPEAT
  5551. toB := fromB + BlockSize;
  5552. IF toB > RowsB THEN toB := RowsB END;
  5553. Block( fromA, toA, fromB, toB ); fromB := toB;
  5554. UNTIL toB = RowsB;
  5555. fromA := toA;
  5556. UNTIL toA = RowsA;
  5557. END MatMulHBlockX;
  5558. PROCEDURE CopyDataR( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  5559. VAR i: SIZE; t: HUGEINT;
  5560. BEGIN
  5561. Tic( t );
  5562. FOR i := 0 TO rows - 1 DO
  5563. Copy4( src, dest, incSrc, incDest, cols );
  5564. INC( src, strideSrc ); INC( dest, strideDest );
  5565. END;
  5566. Toc( t, copyT );
  5567. END CopyDataR;
  5568. PROCEDURE CopyDataX( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  5569. VAR i: SIZE; t: HUGEINT;
  5570. BEGIN
  5571. Tic( t );
  5572. FOR i := 0 TO rows - 1 DO
  5573. Copy8( src, dest, incSrc, incDest, cols );
  5574. INC( src, strideSrc ); INC( dest, strideDest );
  5575. END;
  5576. Toc( t, copyT );
  5577. END CopyDataX;
  5578. PROCEDURE MatMulARARTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5579. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5580. add: BOOLEAN ): BOOLEAN;
  5581. VAR stride: SIZE; adrB, adrC: ADDRESS;
  5582. proc: POINTER TO ARRAY OF MatMulHObjR;
  5583. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5584. t: HUGEINT;
  5585. BEGIN
  5586. NEW(proc,nrProcesses);
  5587. ASSERT( ColsA = RowsB );
  5588. (* allocate 128 bit = 16 byte aligned matrix *)
  5589. stride := Align( ColsA * SIZEOF( REAL ), 16 );
  5590. IF (IncA # SIZEOF( REAL )) OR (StrideA # stride) OR
  5591. (matrixA MOD 16 # 0) THEN
  5592. cacheA := cachePool.Acquire( stride * RowsA );
  5593. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5594. SIZEOF( REAL ), stride, RowsA, ColsA ); (* copy to array *)
  5595. matrixA := cacheA.adr;
  5596. ELSE cacheA := NIL;
  5597. END;
  5598. IF (StrideB # SIZEOF( REAL )) OR (IncB # stride) OR
  5599. (matrixB MOD 16 # 0) THEN
  5600. cacheB := cachePool.Acquire( stride * ColsB );
  5601. CopyDataR( matrixB, cacheB.adr, StrideB, IncB,
  5602. SIZEOF( REAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5603. matrixB := cacheB.adr;
  5604. ELSE cacheB := NIL;
  5605. END;
  5606. Tic( t );
5607. (*! needs a decision rule whether to split by rows or by columns *)
  5608. IF nrProcesses > 1 THEN
  5609. from := 0;
  5610. FOR i := 0 TO nrProcesses - 1 DO
  5611. (*
  5612. to := RowsA * (i + 1) DIV nrProcesses; adrA := matrixA + from * stride;
  5613. adrC := matrixC + from * StrideC;
  5614. *)
  5615. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5616. adrB := matrixB + from * stride;
  5617. adrC := matrixC + from * IncC;
  5618. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5619. RowsA, to0 - from, RowsB, add );
  5620. from := to0;
  5621. END;
  5622. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5623. ELSE
  5624. MatMulHBlockR( matrixA, matrixB, matrixC, stride, IncC,
  5625. StrideC, RowsA, ColsB, RowsB, add );
  5626. END;
  5627. Toc( t, compT ); cachePool.Release( cacheA );
  5628. cachePool.Release( cacheB ); RETURN TRUE;
  5629. END MatMulARARTransposed;
  5630. PROCEDURE MatMulAXAXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5631. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5632. add: BOOLEAN ): BOOLEAN;
  5633. VAR stride: SIZE; adrB, adrC: ADDRESS;
  5634. proc: POINTER TO ARRAY OF MatMulHObjX;
  5635. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5636. t: HUGEINT;
  5637. BEGIN
  5638. NEW(proc,nrProcesses);
  5639. ASSERT( ColsA = RowsB );
  5640. stride := Align( ColsA * SIZEOF( LONGREAL ), 16 );
  5641. IF (IncA # SIZEOF( LONGREAL )) OR (StrideA # stride) OR
  5642. (matrixA MOD 16 # 0) THEN
  5643. cacheA := cachePool.Acquire( stride * RowsA );
  5644. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5645. SIZEOF( LONGREAL ), stride, RowsA, ColsA ); (* copy to array *)
  5646. matrixA := cacheA.adr;
  5647. ELSE cacheA := NIL;
  5648. END;
  5649. IF (StrideB # SIZEOF( LONGREAL )) OR (IncB # stride) OR
  5650. (matrixB MOD 16 # 0) THEN
  5651. cacheB := cachePool.Acquire( stride * ColsB );
  5652. CopyDataX( matrixB, cacheB.adr, StrideB, IncB,
  5653. SIZEOF( LONGREAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5654. matrixB := cacheB.adr;
  5655. ELSE cacheB := NIL;
  5656. END;
  5657. Tic( t );
  5658. IF nrProcesses > 1 THEN
  5659. from := 0;
  5660. FOR i := 0 TO nrProcesses - 1 DO
  5661. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5662. adrB := matrixB + from * stride;
  5663. adrC := matrixC + from * IncC;
  5664. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5665. RowsA, to0 - from, RowsB, add );
  5666. from := to0;
  5667. END;
  5668. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5669. ELSE
  5670. MatMulHBlockX( matrixA, matrixB, matrixC, stride, IncC,
  5671. StrideC, RowsA, ColsB, RowsB, add );
  5672. END;
  5673. Toc( t, compT ); cachePool.Release( cacheA );
  5674. cachePool.Release( cacheB ); RETURN TRUE;
  5675. END MatMulAXAXTransposed;
  5676. (****** strided matrix multiplication with restrictions to increments ******)
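(* common strategy of the two routines below: operands whose increment is not the
   element size are first copied into dense row-major buffers from the cache pool;
   the bulk of C is then computed with the fixed-width SSE kernels (column blocks
   of 24/16/8/4 for REAL, 12/8/4/2 for LONGREAL), the remaining columns are done
   in a plain Oberon loop, and C is copied back if it had to be relocated *)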
  5677. PROCEDURE MatMulARARSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5678. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5679. add: BOOLEAN ): BOOLEAN;
  5680. VAR sum: REAL; CbFrom, i, j, k: SIZE; valA, valB: REAL;
  5681. adrA, adrB, adrC: ADDRESS;
  5682. cacheA, cacheB, cacheC: Cache;
  5683. matrixCO, StrideCO, IncCO: SIZE; t: HUGEINT;
  5684. (*VAR fromA, toA: LONGINT; *)
  5685. BEGIN
  5686. IF (IncA # SIZEOF( REAL )) THEN
  5687. cacheA :=
  5688. cachePool.Acquire( RowsA * ColsA * SIZEOF( REAL ) );
  5689. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5690. SIZEOF( REAL ), SIZEOF( REAL ) * ColsA, RowsA,
  5691. ColsA );
  5692. matrixA := cacheA.adr; IncA := SIZEOF( REAL );
  5693. StrideA := SIZEOF( REAL ) * ColsA;
  5694. END;
  5695. IF (IncB # SIZEOF( REAL )) THEN
  5696. cacheB :=
  5697. cachePool.Acquire( RowsB * ColsB * SIZEOF( REAL ) );
  5698. CopyDataR( matrixB, cacheB.adr, IncB, StrideB,
  5699. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsB,
  5700. ColsB );
  5701. matrixB := cacheB.adr; IncB := SIZEOF( REAL );
  5702. StrideB := SIZEOF( REAL ) * ColsB;
  5703. END;
  5704. IF (IncC # SIZEOF( REAL )) THEN
  5705. cacheC :=
  5706. cachePool.Acquire( RowsA * ColsB * SIZEOF( REAL ) );
  5707. CopyDataR( matrixC, cacheC.adr, IncC, StrideC,
  5708. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsA,
  5709. ColsB );
  5710. matrixCO := matrixC; StrideCO := StrideC;
  5711. IncCO := IncC; matrixC := cacheC.adr;
  5712. IncC := SIZEOF( REAL ); StrideC := SIZEOF( REAL ) * ColsB;
  5713. END;
  5714. Tic( t );
  5715. CbFrom := 0;
  5716. IF ColsB >= 24 THEN
  5717. SSEMul24BlockR( CbFrom, StrideA, StrideB, StrideC,
  5718. ColsA, RowsA, ColsB, RowsB, matrixA,
  5719. matrixB, matrixC, add );
  5720. END;
  5721. IF ColsB - CbFrom >= 16 THEN
  5722. SSEMul16BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5723. CbFrom, matrixA, matrixB, matrixC, add );
  5724. INC( CbFrom, 16 );
  5725. END;
  5726. IF ColsB - CbFrom >= 8 THEN
  5727. SSEMul8BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5728. CbFrom, matrixA, matrixB, matrixC, add );
  5729. INC( CbFrom, 8 );
  5730. END;
  5731. IF ColsB - CbFrom >= 4 THEN
  5732. SSEMul4BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5733. CbFrom, matrixA, matrixB, matrixC, add );
  5734. INC( CbFrom, 4 );
  5735. END;
  5736. IF ColsB - CbFrom > 0 THEN
  5737. (* do it in Oberon *)
  5738. FOR i := 0 TO RowsA - 1 DO
  5739. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5740. FOR j := CbFrom TO ColsB - 1 DO
  5741. adrA := matrixA + i * StrideA;
  5742. adrB := matrixB + j * IncB;
  5743. IF add THEN SYSTEM.GET( adrC, sum )
  5744. ELSE sum := 0
  5745. END;
  5746. FOR k := 0 TO RowsB - 1 DO
  5747. SYSTEM.GET( adrA, valA );
  5748. SYSTEM.GET( adrB, valB );
  5749. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5750. INC( adrA, IncA ); INC( adrB, StrideB );
  5751. END;
  5752. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5753. (* C[i, j] := sum; *)
  5754. END;
  5755. END;
  5756. END;
  5757. Toc( t, compT );
  5758. IF cacheC # NIL THEN
  5759. CopyDataR( matrixC, matrixCO, IncC, StrideC, IncCO,
  5760. StrideCO, RowsA, ColsB );
  5761. END;
  5762. cachePool.Release( cacheA );
  5763. cachePool.Release( cacheB );
  5764. cachePool.Release( cacheC );
  5765. RETURN TRUE;
  5766. END MatMulARARSSEStride;
  5767. PROCEDURE MatMulAXAXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5768. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5769. add: BOOLEAN ): BOOLEAN;
  5770. VAR sum: LONGREAL; CbFrom, i, j, k: SIZE;
  5771. valA, valB: LONGREAL; adrA, adrB, adrC: ADDRESS;
  5772. cacheA, cacheB, cacheC: Cache;
  5773. matrixCO, StrideCO, IncCO:SIZE; t: HUGEINT;
  5774. BEGIN
  5775. IF (IncA # SIZEOF( LONGREAL )) THEN
  5776. cacheA :=
  5777. cachePool.Acquire( RowsA * ColsA * SIZEOF( LONGREAL ) );
  5778. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5779. SIZEOF( LONGREAL ),
  5780. SIZEOF( LONGREAL ) * ColsA, RowsA, ColsA );
  5781. matrixA := cacheA.adr;
  5782. StrideA := SIZEOF( LONGREAL ) * ColsA;
  5783. IncA := SIZEOF( LONGREAL );
  5784. END;
  5785. IF (IncB # SIZEOF( LONGREAL )) THEN
  5786. cacheB :=
  5787. cachePool.Acquire( RowsB * ColsB * SIZEOF( LONGREAL ) );
  5788. CopyDataX( matrixB, cacheB.adr, IncB, StrideB,
  5789. SIZEOF( LONGREAL ),
  5790. SIZEOF( LONGREAL ) * ColsB, RowsB, ColsB );
  5791. matrixB := cacheB.adr;
  5792. StrideB := SIZEOF( LONGREAL ) * ColsB;
  5793. IncB := SIZEOF( LONGREAL );
  5794. END;
  5795. IF (IncC # SIZEOF( LONGREAL )) THEN
  5796. cacheC :=
  5797. cachePool.Acquire( RowsA * ColsB * SIZEOF( LONGREAL ) );
  5798. CopyDataX( matrixC, cacheC.adr, IncC, StrideC,
  5799. SIZEOF( LONGREAL ),
  5800. SIZEOF( LONGREAL ) * ColsB, RowsA, ColsB );
  5801. matrixCO := matrixC; StrideCO := StrideC;
  5802. IncCO := IncC; StrideC := SIZEOF( LONGREAL ) * ColsB;
  5803. IncC := SIZEOF( LONGREAL ); matrixC := cacheC.adr;
  5804. END;
  5805. Tic( t );
  5806. CbFrom := 0;
  5807. IF ColsB >= 12 THEN
  5808. SSEMul12BlockX( CbFrom, StrideA, StrideB, StrideC,
  5809. ColsA, RowsA, ColsB, RowsB, matrixA,
  5810. matrixB, matrixC, add );
  5811. END;
  5812. IF ColsB - CbFrom >= 8 THEN
  5813. SSEMul8BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5814. CbFrom, matrixA, matrixB, matrixC, add );
  5815. INC( CbFrom, 8 );
  5816. END;
  5817. IF ColsB - CbFrom >= 4 THEN
  5818. SSEMul4BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5819. CbFrom, matrixA, matrixB, matrixC, add );
  5820. INC( CbFrom, 4 );
  5821. END;
  5822. IF ColsB - CbFrom >= 2 THEN
  5823. SSEMul2BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5824. CbFrom, matrixA, matrixB, matrixC, add );
  5825. INC( CbFrom, 2 );
  5826. END;
  5827. IF ColsB - CbFrom > 0 THEN
  5828. (* do it in Oberon *)
  5829. FOR i := 0 TO RowsA - 1 DO
  5830. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5831. FOR j := CbFrom TO ColsB - 1 DO
  5832. adrA := matrixA + i * StrideA;
  5833. adrB := matrixB + j * IncB;
  5834. IF add THEN SYSTEM.GET( adrC, sum )
  5835. ELSE sum := 0
  5836. END;
  5837. FOR k := 0 TO RowsB - 1 DO
  5838. SYSTEM.GET( adrA, valA );
  5839. SYSTEM.GET( adrB, valB );
  5840. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5841. INC( adrA, IncA ); INC( adrB, StrideB );
  5842. END;
  5843. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5844. (* C[i, j] := sum; *)
  5845. END;
  5846. END;
  5847. END;
  5848. Toc( t, compT );
  5849. IF cacheC # NIL THEN
  5850. CopyDataX( matrixC, matrixCO, IncC, StrideC, IncCO,
  5851. StrideCO, RowsA, ColsB );
  5852. END;
  5853. cachePool.Release( cacheA );
  5854. cachePool.Release( cacheB );
  5855. cachePool.Release( cacheC );
  5856. RETURN TRUE;
  5857. END MatMulAXAXSSEStride;
5858. (****** naive Oberon matrix multiplication ******)
  5859. PROCEDURE MatMulARARNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5860. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5861. add: BOOLEAN );
  5862. (*
  5863. A is M x K matrix, M=rows (A); K=cols(A);
  5864. B is K x N matrix; K=rows(B); N = cols(B);
  5865. C is M x N matrix; M=rows(C); N=cols(C);
  5866. *)
  5867. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5868. val1, val2, sum: REAL; t: HUGEINT;
  5869. BEGIN
  5870. Tic( t );
  5871. FOR i := 1 TO M DO
  5872. adrC := matrixC; adrB := matrixB;
  5873. FOR j := 1 TO N DO
  5874. adrA := matrixA; innerB := adrB;
  5875. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5876. FOR k := 1 TO K DO
  5877. SYSTEM.GET( adrA, val1 );
  5878. SYSTEM.GET( innerB, val2 );
  5879. sum := sum + val1 * val2; INC( adrA, IncA );
  5880. INC( innerB, StrideB );
  5881. END;
  5882. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5883. INC( adrC, IncC );
  5884. END;
  5885. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5886. END;
  5887. Toc( t, compT );
  5888. END MatMulARARNaiive;
  5889. PROCEDURE MatMulAXAXNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5890. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5891. add: BOOLEAN );
  5892. (*
  5893. A is M x K matrix, M=rows (A); K=cols(A);
  5894. B is K x N matrix; K=rows(B); N = cols(B);
  5895. C is M x N matrix; M=rows(C); N=cols(C);
  5896. *)
  5897. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5898. val1, val2, sum: LONGREAL; t: HUGEINT;
  5899. BEGIN
  5900. Tic( t );
  5901. FOR i := 1 TO M DO
  5902. adrC := matrixC; adrB := matrixB;
  5903. FOR j := 1 TO N DO
  5904. adrA := matrixA; innerB := adrB;
  5905. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5906. FOR k := 1 TO K DO
  5907. SYSTEM.GET( adrA, val1 );
  5908. SYSTEM.GET( innerB, val2 );
  5909. sum := sum + val1 * val2; INC( adrA, IncA );
  5910. INC( innerB, StrideB );
  5911. END;
  5912. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5913. INC( adrC, IncC );
  5914. END;
  5915. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5916. END;
  5917. Toc( t, compT );
  5918. END MatMulAXAXNaiive;
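(*
	MatMulARARNaiive and MatMulAXAXNaiive are the straightforward triple loops
	(REAL and LONGREAL element types, respectively).  All matrices are addressed
	generically via an element increment and a row stride, i.e. element (i, j) of
	a matrix with base address adr lives at

		adr + i * Stride + j * Inc

	which is why the loops only advance raw addresses with INC instead of indexing
	into arrays.  With add = TRUE the product is accumulated onto C instead of
	overwriting it.
*)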
  5919. (*
  5920. PROCEDURE Toggle( VAR A, B: LONGINT );
  5921. VAR temp: LONGINT;
  5922. BEGIN
  5923. temp := A; A := B; B := temp;
  5924. END Toggle;
  5925. PROCEDURE Transpose( VAR matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT );
  5926. (*
  5927. prepare computation of C=A*B via C = (B` * A`)`
  5928. *)
  5929. BEGIN
  5930. Toggle( matrixA, matrixB ); Toggle( IncA, StrideB ); Toggle( StrideA, IncB );
  5931. Toggle( IncC, StrideC ); Toggle( M, N );
  5932. END Transpose;
  5933. *)
  5936. PROCEDURE BestMethod( M, N, K: SIZE ): LONGINT;
  5937. BEGIN
  5938. IF M = 1 THEN
  5939. IF N < 32 THEN RETURN cMatMulScalarProduct
  5940. ELSIF N < 256 THEN
  5941. IF K < 256 THEN RETURN cMatMulScalarProduct
  5942. ELSE RETURN cMatMulStride
  5943. END;
  5944. ELSE RETURN cMatMulStride
  5945. END;
  5946. ELSIF N = 1 THEN
  5947. IF (M > 1024) & (K > 1024) THEN
  5948. RETURN cMatMulTransposed
  5949. ELSE RETURN cMatMulScalarProduct
  5950. END;
  5951. ELSIF K = 1 THEN
  5952. IF N < 32 THEN
  5953. IF M < 256 THEN RETURN cMatMulNaive
  5954. ELSE RETURN cMatMulStride
  5955. END;
  5956. ELSIF N < 256 THEN
  5957. IF M < 32 THEN RETURN cMatMulNaive
  5958. ELSE RETURN cMatMulStride
  5959. END;
  5960. ELSE RETURN cMatMulStride
  5961. END;
  5962. ELSIF M < 32 THEN
  5963. IF N < 32 THEN RETURN cMatMulScalarProduct
  5964. ELSIF N < 256 THEN
  5965. IF K < 32 THEN RETURN cMatMulScalarProduct
  5966. ELSE RETURN cMatMulStride
  5967. END;
  5968. ELSE RETURN cMatMulStride
  5969. END;
  5970. ELSIF M < 256 THEN
  5971. IF N < 32 THEN
  5972. IF K < 32 THEN RETURN cMatMulScalarProduct
  5973. ELSE RETURN cMatMulStride
  5974. END;
  5975. ELSE
  5976. IF K < 256 THEN RETURN cMatMulStride
  5977. ELSE RETURN cMatMulBlocked
  5978. END;
  5979. END;
  5980. ELSE
  5981. IF N < 32 THEN RETURN cMatMulStride ELSE
  5982. IF K < 256 THEN RETURN cMatMulStride
  5983. ELSE RETURN cMatMulBlocked
  5984. END;
  5985. END;
  5986. END;
  5987. RETURN cMatMulStride;
  5988. END BestMethod;
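(*
	BestMethod is a hand-tuned decision table over M = rows(C), N = cols(C) and
	K = cols(A) = rows(B).  Roughly: small or degenerate problems (vectors, M or N
	below 32) stay with the generic scalar product or the naive loops, medium-sized
	problems go to the SSE stride kernels, large problems with K >= 256 use the
	cache-blocked multiplication, and the transposed kernel is only chosen for very
	large matrix-vector products (N = 1).  The thresholds (32, 256, 1024) are
	heuristic; see the to-do notes in MatMulR below.
*)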
  5989. (*
  5990. (N) (K) (N)
  5991. CCCCCC AAAAA BBBBB
  5992. CCCCCC AAAAA BBBBB
  5993. (M) CCCCCC = (M) AAAAA * (K) BBBBB
  5994. CCCCCC AAAAA BBBBB
  5995. CCCCCC AAAAA BBBBB
  5996. *)
  5997. PROCEDURE MatMulR( matrixA, matrixB, matrixC: ADDRESS;
  5998. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5999. (*! heuristics for the choice between the different methods need improvement *)
6000. (*! transpose if superior *)
6001. (*! provide a special variant for small [up to 4x4] matrices *)
  6002. VAR M, N, K: SIZE;
  6003. BEGIN
  6004. ASSERT( ColsA = RowsB );
  6005. M := RowsA; N := ColsB; K := ColsA;
  6006. CASE BestMethod( M, N, K ) OF
  6007. | cMatMulScalarProduct:
  6008. RETURN FALSE;
  6009. | cMatMulNaive:
  6010. RETURN MatMulRNaive( matrixA, matrixB, matrixC, IncA,
  6011. StrideA, IncB, StrideB, IncC,
  6012. StrideC, RowsA, ColsA, RowsB,
  6013. ColsB );
  6014. | cMatMulTransposed:
  6015. RETURN MatMulARARTransposed( matrixA, matrixB,
  6016. matrixC, IncA,
  6017. StrideA, IncB,
  6018. StrideB, IncC,
  6019. StrideC, RowsA,
  6020. ColsA, RowsB,
  6021. ColsB, FALSE );
  6022. | cMatMulStride:
  6023. RETURN MatMulARARSSEStride( matrixA, matrixB,
  6024. matrixC, IncA, StrideA,
  6025. IncB, StrideB, IncC,
  6026. StrideC, RowsA,
  6027. ColsA, RowsB, ColsB,
  6028. FALSE );
  6029. | cMatMulBlocked:
  6030. RETURN MatMulARARBlocked( matrixA, matrixB,
  6031. matrixC, IncA, StrideA,
  6032. IncB, StrideB, IncC,
  6033. StrideC, RowsA, ColsA,
  6034. RowsB, ColsB, FALSE );
  6035. ELSE
  6036. RETURN FALSE (* use scalar product for each row and column *)
  6037. END;
  6038. END MatMulR;
  6039. PROCEDURE MatMulX( matrixA, matrixB, matrixC: ADDRESS;
  6040. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6041. VAR M, N, K: SIZE;
  6042. BEGIN
  6043. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6044. K := ColsA;
  6045. (*
  6046. KernelLog.String("MatMulX, M,N,K = "); KernelLog.Int(M,10); KernelLog.Int(N,10); KernelLog.Int(K,10); KernelLog.Ln;
  6047. KernelLog.String("Method= "); KernelLog.Int( BestMethod(M,N,K),10); KernelLog.Ln;
  6048. *)
  6049. CASE BestMethod( M, N, K ) OF
  6050. | cMatMulScalarProduct:
  6051. RETURN FALSE;
  6052. | cMatMulNaive:
  6053. RETURN MatMulXNaive( matrixA, matrixB, matrixC, IncA,
  6054. StrideA, IncB, StrideB, IncC,
  6055. StrideC, RowsA, ColsA, RowsB,
  6056. ColsB );
  6057. | cMatMulTransposed:
  6058. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  6059. matrixC, IncA,
  6060. StrideA, IncB, StrideB,
  6061. IncC, StrideC, RowsA,
  6062. ColsA, RowsB, ColsB,
  6063. FALSE );
  6064. | cMatMulStride:
  6065. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  6066. matrixC, IncA, StrideA,
  6067. IncB, StrideB, IncC,
  6068. StrideC, RowsA, ColsA,
  6069. RowsB, ColsB,
  6070. FALSE );
  6071. | cMatMulBlocked:
  6072. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  6073. matrixC, IncA, StrideA,
  6074. IncB, StrideB, IncC,
  6075. StrideC, RowsA, ColsA,
  6076. RowsB, ColsB, FALSE );
  6077. ELSE
  6078. RETURN FALSE (* use scalar product for each row and column *)
  6079. END;
  6080. END MatMulX;
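(*
	MatMulR / MatMulX (and the accumulating MatMulIncR / MatMulIncX below) are the
	entry points installed into ArrayBase: they assert conformity of the operands,
	ask BestMethod for a strategy and forward to the corresponding kernel.
	Returning FALSE does not signal an error; it means "not handled here", and the
	generic array runtime then falls back to its scalar-product implementation
	(one dot product per element of C).
*)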
  6081. PROCEDURE MatMulIncR( matrixA, matrixB, matrixC: ADDRESS;
  6082. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
6083. (*! heuristics for the choice between the different methods need improvement *)
6084. (*! transpose if superior *)
6085. (*! provide a special variant for small [up to 4x4] matrices *)
  6086. VAR M, N, K: SIZE;
  6087. BEGIN
  6088. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6089. K := ColsA;
  6090. CASE BestMethod( M, N, K ) OF
  6091. | cMatMulScalarProduct:
  6092. RETURN FALSE;
  6093. | cMatMulNaive:
  6094. RETURN MatMulIncRNaive( matrixA, matrixB, matrixC,
  6095. IncA, StrideA, IncB, StrideB,
  6096. IncC, StrideC, RowsA, ColsA,
  6097. RowsB, ColsB );
  6098. | cMatMulTransposed:
  6099. RETURN MatMulARARTransposed( matrixA, matrixB,
  6100. matrixC, IncA,
  6101. StrideA, IncB,
  6102. StrideB, IncC,
  6103. StrideC, RowsA,
  6104. ColsA, RowsB,
  6105. ColsB, TRUE );
  6106. | cMatMulStride:
  6107. RETURN MatMulARARSSEStride( matrixA, matrixB,
  6108. matrixC, IncA, StrideA,
  6109. IncB, StrideB, IncC,
  6110. StrideC, RowsA,
  6111. ColsA, RowsB, ColsB,
  6112. TRUE );
  6113. | cMatMulBlocked:
  6114. RETURN MatMulARARBlocked( matrixA, matrixB,
  6115. matrixC, IncA, StrideA,
  6116. IncB, StrideB, IncC,
  6117. StrideC, RowsA, ColsA,
  6118. RowsB, ColsB, TRUE );
  6119. ELSE
  6120. RETURN FALSE (* use scalar product for each row and column *)
  6121. END;
  6122. END MatMulIncR;
  6123. PROCEDURE MatMulIncX( matrixA, matrixB, matrixC: ADDRESS;
  6124. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6125. VAR M, N, K: SIZE;
  6126. BEGIN
  6127. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6128. K := ColsA;
  6129. CASE BestMethod( M, N, K ) OF
  6130. | cMatMulScalarProduct:
  6131. RETURN FALSE;
  6132. | cMatMulNaive:
  6133. RETURN MatMulIncXNaive( matrixA, matrixB, matrixC,
  6134. IncA, StrideA, IncB, StrideB,
  6135. IncC, StrideC, RowsA, ColsA,
  6136. RowsB, ColsB );
  6137. | cMatMulTransposed:
  6138. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  6139. matrixC, IncA,
  6140. StrideA, IncB, StrideB,
  6141. IncC, StrideC, RowsA,
  6142. ColsA, RowsB, ColsB,
  6143. TRUE );
  6144. | cMatMulStride:
  6145. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  6146. matrixC, IncA, StrideA,
  6147. IncB, StrideB, IncC,
  6148. StrideC, RowsA, ColsA,
  6149. RowsB, ColsB, TRUE );
  6150. | cMatMulBlocked:
  6151. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  6152. matrixC, IncA, StrideA,
  6153. IncB, StrideB, IncC,
  6154. StrideC, RowsA, ColsA,
  6155. RowsB, ColsB, TRUE );
  6156. ELSE
  6157. RETURN FALSE (* use scalar product for each row and column *)
  6158. END;
  6159. END MatMulIncX;
  6160. PROCEDURE MatMulARARBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6161. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  6162. add: BOOLEAN ): BOOLEAN;
  6163. VAR M, N, K, L2M, L2N, L2K: SIZE;
  6164. BEGIN
  6165. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6166. K := ColsA; MagicBlockR( M, N, K, L2M, L2N, L2K );
  6167. (*
  6168. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  6169. IncC, StrideC, RowsA, ColsB, ColsA );
  6170. *)
  6171. MultiplyR( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  6172. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  6173. StrideC, add );
  6174. RETURN TRUE;
  6175. END MatMulARARBlocked;
  6176. PROCEDURE MatMulAXAXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6177. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  6178. add: BOOLEAN ): BOOLEAN;
  6179. VAR M, N, K, L2M, L2N, L2K: SIZE;
  6180. BEGIN
  6181. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6182. K := ColsA; MagicBlockX( M, N, K, L2M, L2N, L2K );
  6183. (*
  6184. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  6185. IncC, StrideC, RowsA, ColsB, ColsA );
  6186. *)
  6187. MultiplyX( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  6188. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  6189. StrideC, add );
  6190. RETURN TRUE;
  6191. END MatMulAXAXBlocked;
  6192. PROCEDURE MatMulRNaive( matrixA, matrixB, matrixC: ADDRESS;
  6193. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6194. BEGIN
  6195. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  6196. IncB, StrideB, IncC, StrideC, RowsA,
  6197. ColsB, ColsA, FALSE );
  6198. RETURN TRUE;
  6199. END MatMulRNaive;
  6200. PROCEDURE MatMulXNaive( matrixA, matrixB, matrixC: ADDRESS;
  6201. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6202. BEGIN
  6203. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  6204. IncB, StrideB, IncC, StrideC, RowsA,
  6205. ColsB, ColsA, FALSE );
  6206. RETURN TRUE;
  6207. END MatMulXNaive;
  6208. PROCEDURE MatMulIncRNaive( matrixA, matrixB, matrixC: ADDRESS;
  6209. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6210. BEGIN
  6211. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  6212. IncB, StrideB, IncC, StrideC, RowsA,
  6213. ColsB, ColsA, TRUE );
  6214. RETURN TRUE;
  6215. END MatMulIncRNaive;
  6216. PROCEDURE MatMulIncXNaive( matrixA, matrixB, matrixC: ADDRESS;
  6217. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6218. BEGIN
  6219. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  6220. IncB, StrideB, IncC, StrideC, RowsA,
  6221. ColsB, ColsA, TRUE );
  6222. RETURN TRUE;
  6223. END MatMulIncXNaive;
  6224. PROCEDURE MatMulXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  6225. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6226. BEGIN
  6227. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  6228. IncA, StrideA, IncB,
  6229. StrideB, IncC, StrideC,
  6230. RowsA, ColsA, RowsB,
  6231. ColsB, FALSE );
  6232. END MatMulXTransposed;
  6233. PROCEDURE MatMulIncXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  6234. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6235. BEGIN
  6236. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  6237. IncA, StrideA, IncB,
  6238. StrideB, IncC, StrideC,
  6239. RowsA, ColsA, RowsB,
  6240. ColsB, TRUE )
  6241. END MatMulIncXTransposed;
  6242. PROCEDURE MatMulRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  6243. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6244. BEGIN
  6245. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  6246. IncA, StrideA, IncB,
  6247. StrideB, IncC, StrideC,
  6248. RowsA, ColsA, RowsB,
  6249. ColsB, FALSE );
  6250. END MatMulRTransposed;
  6251. PROCEDURE MatMulIncRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  6252. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6253. BEGIN
  6254. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  6255. IncA, StrideA, IncB,
  6256. StrideB, IncC, StrideC,
  6257. RowsA, ColsA, RowsB,
  6258. ColsB, TRUE )
  6259. END MatMulIncRTransposed;
  6260. PROCEDURE MatMulXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  6261. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6262. BEGIN
  6263. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  6264. IncA, StrideA, IncB, StrideB,
  6265. IncC, StrideC, RowsA,
  6266. ColsA, RowsB, ColsB,
  6267. FALSE );
  6268. END MatMulXSSEStride;
  6269. PROCEDURE MatMulIncXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  6270. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6271. BEGIN
  6272. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  6273. IncA, StrideA, IncB, StrideB,
  6274. IncC, StrideC, RowsA,
  6275. ColsA, RowsB, ColsB,
  6276. TRUE );
  6277. END MatMulIncXSSEStride;
  6278. PROCEDURE MatMulRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  6279. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6280. BEGIN
  6281. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  6282. IncA, StrideA, IncB, StrideB,
  6283. IncC, StrideC, RowsA,
  6284. ColsA, RowsB, ColsB,
  6285. FALSE );
  6286. END MatMulRSSEStride;
  6287. PROCEDURE MatMulIncRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  6288. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6289. BEGIN
  6290. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  6291. IncA, StrideA, IncB, StrideB,
  6292. IncC, StrideC, RowsA,
  6293. ColsA, RowsB, ColsB,
  6294. TRUE )
  6295. END MatMulIncRSSEStride;
  6296. PROCEDURE MatMulRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6297. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6298. BEGIN
  6299. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  6300. IncA, StrideA, IncB, StrideB,
  6301. IncC, StrideC, RowsA, ColsA,
  6302. RowsB, ColsB, FALSE )
  6303. END MatMulRBlocked;
  6304. PROCEDURE MatMulIncRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6305. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6306. BEGIN
  6307. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  6308. IncA, StrideA, IncB, StrideB,
  6309. IncC, StrideC, RowsA, ColsA,
  6310. RowsB, ColsB, TRUE )
  6311. END MatMulIncRBlocked;
  6312. PROCEDURE MatMulXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6313. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6314. BEGIN
  6315. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  6316. IncA, StrideA, IncB, StrideB,
  6317. IncC, StrideC, RowsA, ColsA,
  6318. RowsB, ColsB, FALSE )
  6319. END MatMulXBlocked;
  6320. PROCEDURE MatMulIncXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6321. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6322. BEGIN
  6323. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  6324. IncA, StrideA, IncB, StrideB,
  6325. IncC, StrideC, RowsA, ColsA,
  6326. RowsB, ColsB, TRUE )
  6327. END MatMulIncXBlocked;
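(*
	Naming convention of the wrappers above: ...R variants work on REAL data,
	...X variants on LONGREAL data, and the ...Inc... variants pass add = TRUE and
	therefore compute C := C + A*B instead of C := A*B.  The wrappers only fix the
	add flag so that each strategy matches the procedure type of the ArrayBase
	hooks installed in SetMatMulMethod below.
*)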
  6328. PROCEDURE SetMatMulMethod*( i: LONGINT );
  6329. BEGIN
  6330. KernelLog.String("ArrayBaseOptimized, method = ");
  6331. IF i = cMatMulDynamic THEN
  6332. KernelLog.String("dynamic.");
  6333. ArrayBase.matMulIncR := MatMulIncR;
  6334. ArrayBase.matMulIncX := MatMulIncX;
  6335. ArrayBase.matMulR := MatMulR;
  6336. ArrayBase.matMulX := MatMulX;
  6337. ELSIF i = cMatMulScalarProduct THEN
  6338. KernelLog.String("scalarproduct.");
  6339. ArrayBase.matMulIncR := NIL;
  6340. ArrayBase.matMulIncX := NIL;
  6341. ArrayBase.matMulR := NIL; ArrayBase.matMulX := NIL;
  6342. ELSIF i = cMatMulNaive THEN
  6343. KernelLog.String("naiive.");
  6344. ArrayBase.matMulR := MatMulRNaive;
  6345. ArrayBase.matMulX := MatMulXNaive;
  6346. ArrayBase.matMulIncR := MatMulIncRNaive;
  6347. ArrayBase.matMulIncX := MatMulIncXNaive;
  6348. ELSIF i = cMatMulTransposed THEN
  6349. KernelLog.String("transposed.");
  6350. ArrayBase.matMulR := MatMulRTransposed;
  6351. ArrayBase.matMulX := MatMulXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  6352. ArrayBase.matMulIncR := MatMulIncRTransposed;
  6353. ArrayBase.matMulIncX := MatMulIncXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  6354. ELSIF i = cMatMulStride THEN
  6355. KernelLog.String("stride.");
  6356. ArrayBase.matMulR := MatMulRSSEStride;
  6357. ArrayBase.matMulX := MatMulXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  6358. ArrayBase.matMulIncR := MatMulIncRSSEStride;
  6359. ArrayBase.matMulIncX := MatMulIncXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  6360. ELSIF i = cMatMulBlocked THEN
  6361. KernelLog.String("blocked.");
  6362. ArrayBase.matMulR := MatMulRBlocked;
  6363. ArrayBase.matMulX := MatMulXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  6364. ArrayBase.matMulIncR := MatMulIncRBlocked;
  6365. ArrayBase.matMulIncX := MatMulIncXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  6366. END;
  6367. KernelLog.Ln;
  6368. END SetMatMulMethod;
  6369. (* optimizations for small arrays (Alexey Morozov) *)
  6370. (* assumes that all arrays do not overlap *)
  6371. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
  6372. PROCEDURE MatMulR2x2(dadr, ladr, radr: ADDRESS);
  6373. CODE{SYSTEM.AMD64, SYSTEM.SSE2}
  6374. MOV RBX, [RBP+radr] ; RBX := ADDR(right)
  6375. MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
  6376. MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
  6377. MOVUPS XMM0, [RAX] ; [a00,a01,a10,a11]
  6378. MOVUPS XMM1, [RBX] ; [b00,b01,b10,b11]
  6379. MOVAPS XMM2, XMM1
  6380. SHUFPS XMM2, XMM1, 204 ; XMM2 := [b00,b11,b00,b11]
  6381. MULPS XMM2, XMM0
  6382. SHUFPS XMM0, XMM0, 177 ; XMM0 := [a01,a00,a11,a10]
  6383. SHUFPS XMM1, XMM1, 102 ; XMM1 := [b10,b01,b10,b01]
  6384. MULPS XMM1, XMM0
  6385. ADDPS XMM1, XMM2
  6386. MOVUPS [RCX], XMM1
  6387. END MatMulR2x2;
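(*
	MatMulR2x2 computes the full 2x2 product with two shuffles and two packed
	multiplications:
		XMM2 = [b00,b11,b00,b11] * [a00,a01,a10,a11]
		XMM1 = [b10,b01,b10,b01] * [a01,a00,a11,a10]
	so that XMM1 + XMM2 = [a00*b00 + a01*b10, a00*b01 + a01*b11,
	a10*b00 + a11*b10, a10*b01 + a11*b11] = [c00,c01,c10,c11].
*)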
  6388. (* based on weighted sum of rows (Alexey Morozov) *)
  6389. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
  6390. PROCEDURE MatMulR3x3(dadr, ladr, radr: ADDRESS);
  6391. CODE{SYSTEM.AMD64, SYSTEM.SSE2}
  6392. MOV RBX, [RBP+radr] ; RBX := ADDR(right)
  6393. MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
  6394. MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
  6395. MOVUPS XMM0, [RBX] ; XMM0 := [b00,b01,b02,-]
  6396. MOVUPS XMM1, [RBX+12] ; XMM1 := [b10,b11,b12,-]
6397. ; note: this unaligned load reads one REAL past the end of B; the commented-out MOVLPS/MOVSS/MOVLHPS sequence below avoids the overread
  6398. MOVUPS XMM2, [RBX+24] ; XMM2 := [b20,b21,b22,-]
  6399. ;MOVLPS XMM2, [RBX+24]
  6400. ;MOVSS XMM3, [RBX+32]
  6401. ;MOVLHPS XMM2, XMM3
  6402. MOVSS XMM3, [RAX]
  6403. SHUFPS XMM3, XMM3, 0; XMM3 := [a00,a00,a00,-]
  6404. MOVAPS XMM4, XMM0
  6405. MULPS XMM4, XMM3
  6406. MOVSS XMM3, [RAX+4]
  6407. SHUFPS XMM3, XMM3, 0; XMM3 := [a01,a01,a01,-]
  6408. MULPS XMM3, XMM1
  6409. ADDPS XMM4, XMM3
  6410. MOVSS XMM3, [RAX+8]
  6411. SHUFPS XMM3, XMM3, 0; XMM3 := [a02,a02,a02,-]
  6412. MULPS XMM3, XMM2
  6413. ADDPS XMM4, XMM3
  6414. MOVUPS [RCX], XMM4
  6415. ;***************************************************;
  6416. MOVSS XMM3, [RAX+12]
  6417. SHUFPS XMM3, XMM3, 0; XMM3 := [a10,a10,a10,-]
  6418. MOVAPS XMM4, XMM0
  6419. MULPS XMM4, XMM3
  6420. MOVSS XMM3, [RAX+16]
  6421. SHUFPS XMM3, XMM3, 0; XMM3 := [a11,a11,a11,-]
  6422. MULPS XMM3, XMM1
  6423. ADDPS XMM4, XMM3
  6424. MOVSS XMM3, [RAX+20]
  6425. SHUFPS XMM3, XMM3, 0; XMM3 := [a12,a12,a12,-]
  6426. MULPS XMM3, XMM2
  6427. ADDPS XMM4, XMM3
  6428. MOVUPS [RCX+12], XMM4
  6429. ;***************************************************;
  6430. MOVSS XMM3, [RAX+24]
  6431. SHUFPS XMM3, XMM3, 0; XMM3 := [a20,a20,a20,-]
  6432. MOVAPS XMM4, XMM0
  6433. MULPS XMM4, XMM3
  6434. MOVSS XMM3, [RAX+28]
  6435. SHUFPS XMM3, XMM3, 0; XMM3 := [a21,a21,a21,-]
  6436. MULPS XMM3, XMM1
  6437. ADDPS XMM4, XMM3
  6438. MOVSS XMM3, [RAX+32]
  6439. SHUFPS XMM3, XMM3, 0; XMM3 := [a22,a22,a22,-]
  6440. MULPS XMM3, XMM2
  6441. ADDPS XMM4, XMM3
  6442. ;MOVUPS [RCX+24], XMM4
  6443. MOVLPS [RCX+24], XMM4
  6444. MOVHLPS XMM4, XMM4
  6445. MOVSS [RCX+32], XMM4
  6446. END MatMulR3x3;
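(*
	MatMulR3x3 forms each row of C as a weighted sum of the rows of B:
		C[i,0..2] = a[i,0]*B[0,0..2] + a[i,1]*B[1,0..2] + a[i,2]*B[2,0..2]
	The three rows of B are kept in XMM0..XMM2, each scalar a[i,j] is broadcast
	with SHUFPS ..., 0 and multiplied in; the last row of C is stored in two
	pieces (MOVLPS and MOVSS) to avoid writing past the end of the destination.
*)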
6447. (* based on a 2x2 block decomposition of the 4x4 matrices; computes all eight 2x2 block products with the MatMulR2x2 scheme (Alexey Morozov) *)
  6448. (* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
  6449. PROCEDURE MatMulR4x4(dadr, ladr, radr: ADDRESS);
  6450. CODE{SYSTEM.AMD64, SYSTEM.SSE2}
  6451. MOV RBX, [RBP+radr] ; RBX := ADDR(right)
  6452. MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
  6453. MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
  6454. ; load A00
  6455. MOVLPS XMM0, [RAX] ; XMM0 := [a00,a01,-,-]
  6456. MOVHPS XMM0, [RAX+16] ; XMM0 := [a00,a01,a10,a11]
  6457. ; load A01
  6458. MOVLPS XMM1, [RAX+8] ; XMM1 := [a02,a03,-,-]
  6459. MOVHPS XMM1, [RAX+24] ; XMM1 := [a02,a03,a12,a13]
  6460. ; load B00
  6461. MOVLPS XMM2, [RBX] ; XMM2 := [b00,b01,-,-]
  6462. MOVHPS XMM2, [RBX+16] ; XMM2 := [b00,b01,b10,b11]
  6463. ; load B01
6464. MOVLPS XMM3, [RBX+8] ; XMM3 := [b02,b03,-,-]
6465. MOVHPS XMM3, [RBX+24] ; XMM3 := [b02,b03,b12,b13]
  6466. ; load B10
  6467. MOVLPS XMM4, [RBX+32] ; XMM4 := [b20,b21,-,-]
  6468. MOVHPS XMM4, [RBX+48] ; XMM4 := [b20,b21,b30,b31]
  6469. ; load B11
  6470. MOVLPS XMM5, [RBX+40] ; XMM5 := [b22,b23,-,-]
  6471. MOVHPS XMM5, [RBX+56] ; XMM5 := [b22,b23,b32,b33]
  6472. ;****************************************************;
  6473. ; multiply A00(D)*B00(E) (use MatMulR2x2 code)
  6474. MOVAPS XMM6, XMM2
  6475. SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
  6476. MULPS XMM6, XMM0
  6477. SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
  6478. MOVAPS XMM7, XMM2
  6479. SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
  6480. MULPS XMM7, XMM0
  6481. ADDPS XMM7, XMM6
  6482. ; multiply A01(D)*B10(E)
  6483. MOVAPS XMM0, XMM4
  6484. SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
  6485. MULPS XMM0, XMM1
  6486. SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
  6487. MOVAPS XMM6, XMM4
  6488. SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
  6489. MULPS XMM6, XMM1
  6490. ADDPS XMM6, XMM0
  6491. ADDPS XMM7, XMM6
  6492. MOVLPS [RCX], XMM7
  6493. MOVHPS [RCX+16], XMM7
  6494. ;****************************************************;
  6495. ; load A00
  6496. MOVLPS XMM0, [RAX] ; XMM0 := [a00,a01,-,-]
  6497. MOVHPS XMM0, [RAX+16] ; XMM0 := [a00,a01,a10,a11]
  6498. ; load A01
  6499. MOVLPS XMM1, [RAX+8] ; XMM1 := [a02,a03,-,-]
  6500. MOVHPS XMM1, [RAX+24] ; XMM1 := [a02,a03,a12,a13]
  6501. ; multiply A00(D)*B01(E) (use MatMulR2x2 code)
  6502. MOVAPS XMM6, XMM3
  6503. SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
  6504. MULPS XMM6, XMM0
  6505. SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
  6506. MOVAPS XMM7, XMM3
  6507. SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
  6508. MULPS XMM7, XMM0
  6509. ADDPS XMM7, XMM6
  6510. ; multiply A01(D)*B11(E)
  6511. MOVAPS XMM0, XMM5
  6512. SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
  6513. MULPS XMM0, XMM1
  6514. SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
  6515. MOVAPS XMM6, XMM5
  6516. SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
  6517. MULPS XMM6, XMM1
  6518. ADDPS XMM6, XMM0
  6519. ADDPS XMM7, XMM6
  6520. MOVLPS [RCX+8], XMM7
  6521. MOVHPS [RCX+24], XMM7
  6522. ;****************************************************;
  6523. ; load A10
  6524. MOVLPS XMM0, [RAX+32] ; XMM0 := [a20,a21,-,-]
  6525. MOVHPS XMM0, [RAX+48] ; XMM0 := [a20,a21,a30,a31]
  6526. ; load A11
  6527. MOVLPS XMM1, [RAX+40] ; XMM1 := [a22,a23,-,-]
  6528. MOVHPS XMM1, [RAX+56] ; XMM1 := [a22,a23,a32,a33]
  6529. ; multiply A10(D)*B00(E) (use MatMulR2x2 code)
  6530. MOVAPS XMM6, XMM2
  6531. SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
  6532. MULPS XMM6, XMM0
  6533. SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
  6534. MOVAPS XMM7, XMM2
  6535. SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
  6536. MULPS XMM7, XMM0
  6537. ADDPS XMM7, XMM6
  6538. ; multiply A11(D)*B10(E)
  6539. MOVAPS XMM0, XMM4
  6540. SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
  6541. MULPS XMM0, XMM1
  6542. SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
  6543. MOVAPS XMM6, XMM4
  6544. SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
  6545. MULPS XMM6, XMM1
  6546. ADDPS XMM6, XMM0
  6547. ADDPS XMM7, XMM6
  6548. MOVLPS [RCX+32], XMM7
  6549. MOVHPS [RCX+48], XMM7
  6550. ;****************************************************;
  6551. ; load A10
  6552. MOVLPS XMM0, [RAX+32] ; XMM0 := [a20,a21,-,-]
  6553. MOVHPS XMM0, [RAX+48] ; XMM0 := [a20,a21,a30,a31]
  6554. ; load A11
  6555. MOVLPS XMM1, [RAX+40] ; XMM1 := [a22,a23,-,-]
  6556. MOVHPS XMM1, [RAX+56] ; XMM1 := [a22,a23,a32,a33]
  6557. ; multiply A10(D)*B01(E) (use MatMulR2x2 code)
  6558. MOVAPS XMM6, XMM3
  6559. SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
  6560. MULPS XMM6, XMM0
  6561. SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
  6562. MOVAPS XMM7, XMM3
  6563. SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
  6564. MULPS XMM7, XMM0
  6565. ADDPS XMM7, XMM6
  6566. ; multiply A11(D)*B11(E)
  6567. MOVAPS XMM0, XMM5
  6568. SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
  6569. MULPS XMM0, XMM1
  6570. SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
  6571. MOVAPS XMM6, XMM5
  6572. SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
  6573. MULPS XMM6, XMM1
  6574. ADDPS XMM6, XMM0
  6575. ADDPS XMM7, XMM6
  6576. MOVLPS [RCX+40], XMM7
  6577. MOVHPS [RCX+56], XMM7
  6578. END MatMulR4x4;
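(*
	MatMulR4x4 partitions A, B and C into 2x2 blocks of 2x2 matrices and applies
	the MatMulR2x2 shuffle pattern to each block product:
		C00 = A00*B00 + A01*B10,   C01 = A00*B01 + A01*B11,
		C10 = A10*B00 + A11*B10,   C11 = A10*B01 + A11*B11.
	Each 2x2 block is gathered from two rows with MOVLPS/MOVHPS because the 4x4
	matrices are stored row by row.
*)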
  6579. (* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
  6580. (* FIXME: speed it up when horizontal add is available!!! *)
  6581. PROCEDURE MatVecMulR2x2(dadr, ladr, radr: ADDRESS);
  6582. CODE{SYSTEM.AMD64, SYSTEM.SSE2}
  6583. MOV RBX, [RBP+radr] ; RBX := ADDR(right)
  6584. MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
  6585. MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
  6586. ; load the whole matrix
  6587. MOVUPS XMM0, [RAX] ; XMM0 := [a00,a01,a10,a11]
6588. MOVLPS XMM1, [RBX] ; XMM1 := [b00,b10,-,-]
  6589. MOVLHPS XMM1, XMM1 ; XMM1 := [b00,b10,b00,b10]
  6590. MULPS XMM0, XMM1 ; XMM0 := [a00*b00,a01*b10,a10*b00,a11*b10]
  6591. MOVAPS XMM1, XMM0
  6592. SHUFPS XMM0, XMM0, 8; XMM0 := [a00*b00,a10*b00,-,-]
  6593. SHUFPS XMM1, XMM1, 13; XMM1 := [a01*b10,a11*b10,-,-]
  6594. ADDPS XMM0, XMM1
  6595. MOVLPS [RCX], XMM0
  6596. END MatVecMulR2x2;
  6597. (* PH *)
  6598. (* to do: use MOVAPS when Felix fixes issues with alignment *)
  6599. PROCEDURE MatVecMulR4x4(dadr, ladr, radr: ADDRESS);
  6600. CODE{SYSTEM.AMD64, SYSTEM.SSE3}
  6601. MOV RBX, [RBP+radr] ; RBX := ADDR(right)
  6602. MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
  6603. MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
  6604. MOVUPS XMM0, [RBX] ; XMM0 := [b0,b1,b2,b3]
  6605. MOVUPS XMM1, [RAX] ; XMM1 := [a00,a01,a02,a03]
  6606. MOVUPS XMM2, [RAX+16] ; XMM2 := [a10,a11,a12,a13]
  6607. MOVUPS XMM3, [RAX+32] ; XMM3 := [a20,a21,a22,a23]
  6608. MOVUPS XMM4, [RAX+48] ; XMM4 := [a30,a31,a32,a33]
  6609. MULPS XMM1, XMM0
  6610. MULPS XMM2, XMM0
  6611. HADDPS XMM1, XMM2 ; adjacent pairs are horizontally added
  6612. MULPS XMM3, XMM0
  6613. MULPS XMM4, XMM0
  6614. HADDPS XMM3, XMM4 ; adjacent pairs are horizontally added
  6615. HADDPS XMM1, XMM3 ; adjacent pairs are horizontally added
  6616. MOVUPS [RCX], XMM1
  6617. END MatVecMulR4x4;
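(*
	MatVecMulR4x4 loads the vector once into XMM0 and the four rows of A into
	XMM1..XMM4, multiplies each row element-wise with the vector and reduces the
	four products with three HADDPS instructions, which is why it requires SSE3
	and is installed in InstallSSE3 below, whereas MatVecMulR2x2 above needs only
	shuffles and is installed with the SSE optimizations.
*)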
  6618. PROCEDURE InstallMatMul*(context: Commands.Context);
  6619. VAR type: LONGINT; string: ARRAY 32 OF CHAR;
  6620. BEGIN
  6621. context.arg.String(string);
  6622. IF string = "dynamic" THEN
  6623. type := cMatMulDynamic;
  6624. ELSIF string = "scalarproduct" THEN
  6625. type := cMatMulScalarProduct
  6626. ELSIF string = "naive" THEN
  6627. type := cMatMulNaive
  6628. ELSIF string = "transposed" THEN
  6629. type := cMatMulTransposed
  6630. ELSIF string = "stride" THEN
  6631. type := cMatMulStride
  6632. ELSIF string ="blocked" THEN
  6633. type := cMatMulBlocked
  6634. ELSE
  6635. KernelLog.String("unknown method: "); KernelLog.String(string); KernelLog.Ln;
  6636. type := cMatMulDynamic;
  6637. END;
  6638. SetMatMulMethod( type );
  6639. END InstallMatMul;
  6640. PROCEDURE InstallAsm*;
  6641. BEGIN
  6642. KernelLog.String( "ASM " );
  6643. ArrayBase.loopSPAXAX := SPAXAXLoopA;
  6644. ArrayBase.loopSPARAR := SPARARLoopA;
  6645. ArrayBase.loopAddAXAX := AddAXAXLoopA;
  6646. ArrayBase.loopAddARAR := AddARARLoopA;
  6647. ArrayBase.loopSubAXAX := SubAXAXLoopA;
  6648. ArrayBase.loopSubARAR := SubARARLoopA;
  6649. ArrayBase.loopEMulAXAX := EMulAXAXLoopA;
  6650. ArrayBase.loopEMulARAR := EMulARARLoopA;
  6651. ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
  6652. ArrayBase.loopMatMulARAR := MatMulARARLoopA;
  6653. ArrayBase.loopMulAXSX := MulAXSXLoopA;
  6654. ArrayBase.loopIncMulAXSX := IncMulAXSXLoopA;
  6655. ArrayBase.loopMulARSR := MulARSRLoopA;
  6656. ArrayBase.loopIncMulARSR := IncMulARSRLoopA;
  6657. ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopA;
  6658. ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopA;
  6659. ArrayBase.transpose4 := Transpose4;
  6660. ArrayBase.transpose8 := Transpose8;
  6661. END InstallAsm;
  6662. PROCEDURE InstallSSE*;
  6663. BEGIN
  6664. IF Machine.SSESupport THEN
  6665. KernelLog.String( "SSE " );
  6666. ArrayBase.loopSPARAR := SPARARLoopSSE;
  6667. ArrayBase.loopAddARAR := AddARARLoopSSE;
  6668. ArrayBase.loopSubARAR := SubARARLoopSSE;
  6669. ArrayBase.loopEMulARAR := EMulARARLoopSSE;
  6670. ArrayBase.loopMulARSR := MulARSRLoopSSE;
  6671. ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
  6672. ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
  6673. ArrayBase.matMulR := MatMulR;
  6674. ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopSSE;
  6675. ArrayBase.matMulIncR := MatMulIncR;
  6676. (* optimizations for small matrices (Alexey Morozov) *)
  6677. ArrayBase.matMulR2x2 := MatMulR2x2;
  6678. ArrayBase.matMulR3x3 := MatMulR3x3;
  6679. ArrayBase.matMulR4x4 := MatMulR4x4;
  6680. ArrayBase.matVecMulR2x2 := MatVecMulR2x2;
  6681. END;
  6682. END InstallSSE;
  6683. PROCEDURE InstallSSE2*; (* extra for testing, will be merged with Install in later versions *)
  6684. BEGIN
  6685. IF Machine.SSE2Support THEN
  6686. KernelLog.String( "SSE2 " );
  6687. ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
  6688. ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
  6689. ArrayBase.loopSubAXAX := SubAXAXLoopSSE;
  6690. ArrayBase.loopEMulAXAX := EMulAXAXLoopSSE;
  6691. ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
  6692. ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
  6693. ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;
  6694. ArrayBase.matMulX := MatMulX;
  6695. ArrayBase.loopMatMulIncAXAX :=
  6696. MatMulIncAXAXLoopSSE;
  6697. ArrayBase.matMulIncX := MatMulIncX;
  6698. END;
  6699. END InstallSSE2;
6700. (*! to do: currently this only works under Windows, not natively, because SSE3Support is not yet implemented in BIOS.I386.Machine.Mod *)
  6701. PROCEDURE InstallSSE3*; (* extra for testing, will be merged with Install in later versions *)
  6702. BEGIN
  6703. IF Machine.SSE3Support THEN
  6704. KernelLog.String( "SSE3 " );
  6705. (* optimizations for small matrices *)
  6706. ArrayBase.matVecMulR4x4 := MatVecMulR4x4;
  6707. END;
  6708. END InstallSSE3;
  6709. PROCEDURE Install*;
  6710. BEGIN
  6711. KernelLog.String( "ArrayBaseOptimized: installing runtime library optimizations:" );
  6712. InstallAsm; InstallSSE; InstallSSE2; InstallSSE3;
  6713. KernelLog.String( " done." ); KernelLog.Ln;
  6714. END Install;
  6715. PROCEDURE SetParameters*( context: Commands.Context );
  6716. BEGIN
  6717. context.arg.SkipWhitespace; context.arg.Int(cBlockSize,TRUE);
  6718. context.arg.SkipWhitespace; context.arg.Int(nrProcesses,TRUE);
  6719. IF nrProcesses > maxProcesses THEN
  6720. nrProcesses := maxProcesses
  6721. ELSIF nrProcesses = 0 THEN nrProcesses := LONGINT (Machine.NumberOfProcessors());
  6722. END;
  6723. KernelLog.String( "BlockSize=" ); KernelLog.Int( cBlockSize, 0 );
  6724. KernelLog.String( ", NrProcesses = " ); KernelLog.Int( nrProcesses, 0 ); KernelLog.Ln;
  6725. END SetParameters;
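(*
	SetParameters expects two integers: the block size used by the blocked
	multiplication (0 = automatic) and the number of worker processes
	(0 = one per processor, capped at maxProcesses), e.g.

		ArrayBaseOptimized.SetParameters 0 4 ~
*)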
  6726. BEGIN
  6727. cBlockSize := 0; (* automatic *)
  6728. nrProcesses := LONGINT (Machine.NumberOfProcessors()); (* automatic *)
  6729. allocT := 0; copyT := 0; compT := 0;
  6730. NEW( cachePool );
  6731. END FoxArrayBaseOptimized.
  6732. System.Free ArrayBaseOptimized ~
  6733. ArrayBaseOptimized.Install ~
  6734. ArrayBaseOptimized.InstallSSE2 ~
  6735. ArrayBaseOptimized.InstallSSE ~
  6736. ArrayBaseOptimized.InstallAsm ~
  6737. ArrayBaseOptimized.InstallMatMul dynamic ~
  6738. ArrayBaseOptimized.InstallMatMul scalarproduct ~
  6739. ArrayBaseOptimized.InstallMatMul transposed ~
  6740. ArrayBaseOptimized.InstallMatMul naive ~
  6741. ArrayBaseOptimized.InstallMatMul stride ~
  6742. ArrayBaseOptimized.InstallMatMul blocked ~
  6743. ArrayBaseOptimized.SetParameters 0 1 ~ (* BlockSize, NrProcesses *)