AMD64.FoxArrayBaseOptimized.Mod

  1. MODULE FoxArrayBaseOptimized; (** AUTHOR "fof"; PURPOSE ""; **)
  2. IMPORT SYSTEM, ArrayBase := FoxArrayBase, Machine, KernelLog, Commands ;
  3. CONST
  4. L2CacheSize = 512 * 1024; (* L1CacheSize = 16 * 1024; *)
  5. (* parameters for blocking matrix multiplication *)
  6. L1BlockN = 5; (* L1 block size -> nr of columns in a block that can be processed using L1 cache *)
  7. L2BARatio = 1;
  8. L0BlockKR = 4; (* L0 block size -> nr of elements that can be processed at once for type REAL *)
  9. L1MaxBlockKR = 336; (* L1CacheSize/SIZEOF(REAL)/2/6*)
  10. L2BlockSize = 81920;
  11. L0BlockKX = 2; (* L0 block size -> nr of elements that can be processed at once for type LONGREAL *)
  12. L1MaxBlockKX = 256; (* > L1CacheSize/SIZEOF(LONGREAL)/2/6*)
  13. (*
  14. DefaultL2CacheSize = 81920;
  15. L2SizeR = L2CacheSize DIV 8; MaxBlockKR = 336; (* ca L1CacheSize/SIZEOF(REAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
  16. L2SizeX = L2CacheSize DIV 8; MaxBlockKX = 256; (* a bit more than L1CacheSize/SIZEOF(LONGREAL)/2/6 *) (* nr of elements that can be processed using L2 cache *)
  17. *)
  18. debug = FALSE; parallel = TRUE; SSE = TRUE;
  19. MaxCachePoolSize = 0 (* disabled *) (* 646*1024*1024 *) (* enabled *) ;
  20. maxProcesses = 32;
  21. cMatMulDynamic* = -1; cMatMulScalarProduct* = 0;
  22. cMatMulNaive* = 1; cMatMulTransposed* = 2;
  23. cMatMulStride* = 3; cMatMulBlocked* = 4;
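(* Added note (illustrative, not part of the original module): a rough derivation of the L1 blocking bounds,
   assuming the commented-out L1CacheSize = 16 * 1024:
     REAL:     16384 DIV 4 DIV 2 DIV 6 = 341  ->  L1MaxBlockKR = 336 (rounded down to a multiple of 16)
     LONGREAL: 16384 DIV 8 DIV 2 DIV 6 = 170  ->  L1MaxBlockKX = 256 (deliberately larger, as the original
               comment "> L1CacheSize/SIZEOF(LONGREAL)/2/6" indicates). *)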
  24. VAR
  25. cBlockSize*: LONGINT; nrProcesses*: LONGINT;
  26. lastUsedBlockSize*: SIZE;
  27. allocT-, copyT-, zeroT-, compT-: HUGEINT;
  28. TYPE
  29. Cache = POINTER TO RECORD
  30. p: ANY;
  31. adr: ADDRESS; size: SIZE;
  32. prev, next: Cache;
  33. END;
  34. CachePool = OBJECT
  35. (*! provide heuristics for overall size *)
  36. VAR first, last: Cache;
  37. PROCEDURE & Init*;
  38. BEGIN
  39. NEW( first ); first.size := 0; (* sentinel *)
  40. NEW( last ); last.size := MAX( LONGINT ); (* sentinel *)
  41. first.next := last; first.prev := NIL; last.prev := first; last.next := NIL;
  42. END Init;
  43. PROCEDURE Acquire( size: SIZE ): Cache;
  44. VAR c: Cache; t: HUGEINT;
  45. BEGIN {EXCLUSIVE}
  46. IF size = 0 THEN RETURN first END;
  47. Tic( t );
  48. c := last;
  49. WHILE (c.prev.size >= size) DO
  50. c := c.prev;
  51. END;
  52. IF c = last THEN
  53. NEW( c ); SYSTEM.NEW( c.p, size + 16 );
  54. c.adr := Align( c.p , 16 );
  55. c.size := size;
  56. ELSE
  57. c.prev.next := c.next;
  58. c.next.prev := c.prev;
  59. c.prev := NIL; c.next := NIL;
  60. END;
  61. Toc( t, allocT ); RETURN c;
  62. END Acquire;
  63. PROCEDURE Release( c: Cache );
  64. VAR t: Cache;
  65. BEGIN {EXCLUSIVE}
  66. IF (c=first) OR (c=NIL) THEN RETURN END;
  67. ASSERT(c.size > 0);
  68. IF c.size > MaxCachePoolSize THEN RETURN END;
  69. t := first;
  70. WHILE (t.size <= c.size) DO t := t.next; END;
  71. c.prev := t.prev; c.next := t; t.prev := c; c.prev.next := c;
  72. END Release;
  73. END CachePool;
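(* Usage sketch (added for illustration; variable names are placeholders): CachePool hands out 16-byte aligned
   scratch buffers. Note that with MaxCachePoolSize = 0, Release returns without re-inserting the buffer,
   i.e. pooling is effectively disabled.
     VAR c: Cache;
     BEGIN
       c := cachePool.Acquire( len );   (* c.adr: 16-byte aligned start of at least len bytes *)
       (* ... use the memory at c.adr ... *)
       cachePool.Release( c )
     END *)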
  74. ComputationObj = OBJECT
  75. VAR done: BOOLEAN;
  76. PROCEDURE & Init*;
  77. BEGIN
  78. done := FALSE;
  79. END Init;
  80. PROCEDURE Compute; (*abstract*)
  81. END Compute;
  82. PROCEDURE Wait;
  83. BEGIN {EXCLUSIVE}
  84. AWAIT( done );
  85. END Wait;
  86. BEGIN {ACTIVE, EXCLUSIVE}
  87. Compute; done := TRUE;
  88. END ComputationObj;
  89. MatMulHObjR = OBJECT (ComputationObj)
  90. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  91. add: BOOLEAN;
  92. PROCEDURE & InitR*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  93. add: BOOLEAN );
  94. BEGIN
  95. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  96. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  97. SELF.IncC := IncC; SELF.StrideC := StrideC;
  98. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  99. SELF.Cols := Cols; SELF.add := add;
  100. END InitR;
  101. PROCEDURE Compute;
  102. BEGIN
  103. MatMulHBlockR( MatrixA, MatrixB, MatrixC, Stride, IncC,
  104. StrideC, RowsA, RowsB, Cols, add );
  105. END Compute;
  106. END MatMulHObjR;
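(* Usage sketch (added for illustration; the arguments are placeholders): a ComputationObj subclass starts
   computing in its active body as soon as it is created, and the caller synchronizes with Wait:
     VAR obj: MatMulHObjR;
     BEGIN
       NEW( obj, adrA, adrB, adrC, stride, incC, strideC, rowsA, rowsB, cols, FALSE );
       (* MatMulHBlockR now runs concurrently *)
       obj.Wait   (* blocks until done *)
     END *)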
  107. MatMulHObjX = OBJECT (ComputationObj)
  108. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  109. add: BOOLEAN;
  110. PROCEDURE & InitX*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  111. add: BOOLEAN );
  112. BEGIN
  113. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  114. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  115. SELF.IncC := IncC; SELF.StrideC := StrideC;
  116. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  117. SELF.Cols := Cols; SELF.add := add;
  118. END InitX;
  119. PROCEDURE Compute;
  120. BEGIN
  121. MatMulHBlockX( MatrixA, MatrixB, MatrixC, Stride, IncC,
  122. StrideC, RowsA, RowsB, Cols, add );
  123. END Compute;
  124. END MatMulHObjX;
  125. MultiplyObjectR = OBJECT (ComputationObj);
  126. VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK:SIZE;
  127. start, finished: BOOLEAN;
  128. PROCEDURE & InitR*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  129. BEGIN
  130. Init; start := FALSE; finished := FALSE;
  131. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  132. SELF.M := M; SELF.N := N; SELF.K := K;
  133. SELF.IncC := IncC; SELF.StrideC := StrideC;
  134. SELF.L2BlockM := L2BlockM;
  135. SELF.L2BlockN := L2BlockN;
  136. SELF.L2BlockK := L2BlockK;
  137. END InitR;
  138. PROCEDURE Compute;
  139. BEGIN
  140. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  141. L2BlockN, L2BlockK );
  142. END Compute;
  143. END MultiplyObjectR;
  144. MultiplyObjectX = OBJECT (ComputationObj);
  145. VAR adrA, adrB:ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
  146. start, finished: BOOLEAN;
  147. PROCEDURE & InitX*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  148. BEGIN
  149. Init; start := FALSE; finished := FALSE;
  150. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  151. SELF.M := M; SELF.N := N; SELF.K := K;
  152. SELF.IncC := IncC; SELF.StrideC := StrideC;
  153. SELF.L2BlockM := L2BlockM;
  154. SELF.L2BlockN := L2BlockN;
  155. SELF.L2BlockK := L2BlockK;
  156. END InitX;
  157. PROCEDURE Compute;
  158. BEGIN
  159. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  160. L2BlockN, L2BlockK );
  161. END Compute;
  162. END MultiplyObjectX;
  163. VAR
  164. (* ran: Random.Generator; (* testing *)*)
  165. cachePool: CachePool;
  166. (*********** Part 0: assembler routines ***************)
  167. PROCEDURE -L1Block1XA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  168. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  169. MOV RAX, [RSP+K] ; RAX IS counter
  170. MOV RDX, [RSP+adrC]
  171. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  172. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  173. FLD QWORD [RDX] ; S.GET(dadr, x)
  174. loop8:
  175. CMP RAX, 8
  176. JL loop1
  177. FLD QWORD[RBX] ; S.GET(ladr, x)
  178. ADD RBX, 8 ; INC(ladr, incl)
  179. FLD QWORD[RCX] ; S.GET(ladr, y)
  180. ADD RCX, 8 ; INC(radr, incr)
  181. FMULP ; x := x*y
  182. FADDP ; z := z+x
  183. FLD QWORD[RBX] ; S.GET(ladr, x)
  184. ADD RBX, 8 ; INC(ladr, incl)
  185. FLD QWORD[RCX] ; S.GET(ladr, y)
  186. ADD RCX, 8 ; INC(radr, incr)
  187. FMULP ; x := x*y
  188. FADDP ; z := z+x
  189. FLD QWORD[RBX] ; S.GET(ladr, x)
  190. ADD RBX, 8 ; INC(ladr, incl)
  191. FLD QWORD[RCX] ; S.GET(ladr, y)
  192. ADD RCX, 8 ; INC(radr, incr)
  193. FMULP ; x := x*y
  194. FADDP ; z := z+x
  195. FLD QWORD[RBX] ; S.GET(ladr, x)
  196. ADD RBX, 8 ; INC(ladr, incl)
  197. FLD QWORD[RCX] ; S.GET(ladr, y)
  198. ADD RCX, 8 ; INC(radr, incr)
  199. FMULP ; x := x*y
  200. FADDP ; z := z+x
  201. FLD QWORD[RBX] ; S.GET(ladr, x)
  202. ADD RBX, 8 ; INC(ladr, incl)
  203. FLD QWORD[RCX] ; S.GET(ladr, y)
  204. ADD RCX, 8 ; INC(radr, incr)
  205. FMULP ; x := x*y
  206. FADDP ; z := z+x
  207. FLD QWORD[RBX] ; S.GET(ladr, x)
  208. ADD RBX, 8 ; INC(ladr, incl)
  209. FLD QWORD[RCX] ; S.GET(ladr, y)
  210. ADD RCX, 8 ; INC(radr, incr)
  211. FMULP ; x := x*y
  212. FADDP ; z := z+x
  213. FLD QWORD[RBX] ; S.GET(ladr, x)
  214. ADD RBX, 8 ; INC(ladr, incl)
  215. FLD QWORD[RCX] ; S.GET(ladr, y)
  216. ADD RCX, 8 ; INC(radr, incr)
  217. FMULP ; x := x*y
  218. FADDP ; z := z+x
  219. FLD QWORD[RBX] ; S.GET(ladr, x)
  220. ADD RBX, 8 ; INC(ladr, incl)
  221. FLD QWORD[RCX] ; S.GET(ladr, y)
  222. ADD RCX, 8 ; INC(radr, incr)
  223. FMULP ; x := x*y
  224. FADDP ; z := z+x
  225. SUB RAX, 8 ; DEC(len)
  226. JMP loop8 ;
  227. loop1:
  228. CMP RAX, 0 ; WHILE len > 0 DO
  229. JLE endL
  230. FLD QWORD[RBX] ; S.GET(ladr, x)
  231. ADD RBX, 8 ; INC(ladr, incl)
  232. FLD QWORD[RCX] ; S.GET(ladr, y)
  233. ADD RCX, 8 ; INC(radr, incr)
  234. FMULP ; x := x*y
  235. FADDP ; z := z+x
  236. DEC RAX ; DEC(len)
  237. JMP loop1 ;
  238. endL:
  239. FSTP QWORD[RDX] ; S.PUT(dadr, x)
  240. FWAIT ;
  241. ADD RSP, 32 ;
  242. END L1Block1XA;
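(* Added note (reference semantics, not part of the original module): L1Block1XA accumulates a LONGREAL dot
   product of length K into the destination element, with the FPU loop unrolled 8-fold. Roughly:
     S.GET( adrC, c );
     FOR i := 0 TO K-1 DO
       S.GET( adrA + 8*i, x ); S.GET( adrB + 8*i, y ); c := c + x*y
     END;
     S.PUT( adrC, c ) *)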
  243. PROCEDURE -L1Block1XSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  244. (*
  245. matrixA, matrixB must be stored in special format
  246. K>0 guaranteed
  247. *)
  248. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  249. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  250. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  251. MOV RDX, [RSP+K] ; RDX IS counter
  252. XORPD XMM2, XMM2 ;
  253. kLoop8: ;
  254. CMP RDX, 8 ;
  255. JL kLoop2 ;
  256. MOVAPD XMM7, [RBX] ;
  257. MOVAPD XMM0, [RCX] ;
  258. ADD RCX, 16 ;
  259. ADD RBX, 16 ;
  260. MOVAPD XMM6, [RBX] ;
  261. MOVAPD XMM1, [RCX] ;
  262. ADD RCX, 16 ;
  263. ADD RBX, 16 ;
  264. MULPD XMM0, XMM7 ;
  265. ADDPD XMM2, XMM0 ;
  266. MOVAPD XMM5, [RBX] ;
  267. MOVAPD XMM3, [RCX] ;
  268. ADD RCX, 16 ;
  269. ADD RBX, 16 ;
  270. MULPD XMM1, XMM6 ;
  271. ADDPD XMM2, XMM1 ;
  272. MOVAPD XMM7, [RBX] ;
  273. MOVAPD XMM0, [RCX] ;
  274. ADD RCX, 16 ;
  275. ADD RBX, 16 ;
  276. MULPD XMM3, XMM5 ;
  277. ADDPD XMM2, XMM3 ;
  278. MULPD XMM0, XMM7 ;
  279. ADDPD XMM2, XMM0 ;
  280. SUB RDX, 8 ;
  281. JMP kLoop8 ;
  282. kLoop2: ;
  283. CMP RDX, 0 ;
  284. JLE horizontalAdd ;
  285. MOVAPD XMM7, [RBX] ;
  286. MOVAPD XMM0, [RCX] ;
  287. ADD RCX, 16 ;
  288. ADD RBX, 16 ;
  289. MULPD XMM0, XMM7 ;
  290. ADDPD XMM2, XMM0 ;
  291. SUB RDX, 2
  292. JMP kLoop2 ;
  293. horizontalAdd:
  294. MOV RDI, [RSP+adrC] ;
  295. MOVAPD XMM1, XMM2 ;
  296. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  297. ADDPD XMM2, XMM1 ;
  298. ADDSD XMM2, [RDI] ;
  299. MOVSD [RDI], XMM2 ;
  300. endL:
  301. ADD RSP, 32 ;
  302. END L1Block1XSSE;
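(* Added note: the SSE2 variant computes the same dot product two LONGREALs at a time (K is processed in
   chunks of 8, then 2, so K is effectively assumed even, presumably guaranteed by the packed operand
   format). The final SHUFPD/ADDPD pair folds the high half of the accumulator XMM2 onto its low half,
   and ADDSD adds that sum to the destination element. *)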
  303. PROCEDURE -L1Block5XSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  304. (*
  305. matrixA and matrix B are stored in special format !
  306. K > 0 is guaranteed
  307. *)
  308. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  309. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  310. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  311. MOV RDX, [RSP+K] ; RDX IS counter
  312. XORPD XMM2, XMM2 ;
  313. XORPD XMM3, XMM3 ;
  314. XORPD XMM4, XMM4 ;
  315. XORPD XMM5, XMM5 ;
  316. XORPD XMM6, XMM6 ;
  317. kLoop8: ;
  318. CMP RDX, 8 ;
  319. JL kLoop2
  320. ; (*-- 0 -- *) ;
  321. MOVAPD XMM7, [RBX] ; get 4 elements OF A
  322. ADD RBX, 16 ;
  323. MOVAPD XMM0, [RCX] ; get 4 elements OF B
  324. ADD RCX, 16 ;
  325. MOVAPD XMM1, [RCX] ; get 4 elements OF B
  326. ADD RCX, 16 ;
  327. MULPD XMM0, XMM7 ;
  328. ADDPD XMM2, XMM0 ;
  329. MOVAPD XMM0, [RCX] ;
  330. ADD RCX, 16 ;
  331. MULPD XMM1, XMM7 ;
  332. ADDPD XMM3, XMM1 ;
  333. MOVAPD XMM1, [RCX] ;
  334. ADD RCX, 16 ;
  335. MULPD XMM0, XMM7 ;
  336. ADDPD XMM4, XMM0 ;
  337. MOVAPD XMM0, [RCX] ;
  338. ADD RCX, 16 ;
  339. MULPD XMM1, XMM7 ;
  340. ADDPD XMM5, XMM1 ;
  341. MOVAPD XMM1, [RCX] ;
  342. ADD RCX, 16 ;
  343. MULPD XMM0, XMM7 ;
  344. ADDPD XMM6, XMM0
  345. ; (*-- 2 -- *) ;
  346. MOVAPD XMM7, [RBX] ;
  347. ADD RBX, 16 ;
  348. MOVAPD XMM0, [RCX] ;
  349. ADD RCX, 16 ;
  350. MULPD XMM1, XMM7 ;
  351. ADDPD XMM2, XMM1 ;
  352. MOVAPD XMM1, [RCX] ;
  353. ADD RCX, 16 ;
  354. MULPD XMM0, XMM7 ;
  355. ADDPD XMM3, XMM0 ;
  356. MOVAPD XMM0, [RCX] ;
  357. ADD RCX, 16 ;
  358. MULPD XMM1, XMM7 ;
  359. ADDPD XMM4, XMM1 ;
  360. MOVAPD XMM1, [RCX] ;
  361. ADD RCX, 16 ;
  362. MULPD XMM0, XMM7 ;
  363. ADDPD XMM5, XMM0 ;
  364. MOVAPD XMM0, [RCX] ;
  365. ADD RCX, 16 ;
  366. MULPD XMM1, XMM7 ;
  367. ADDPD XMM6, XMM1
  368. ; (*-- 4 -- *) ;
  369. MOVAPD XMM7, [RBX] ;
  370. ADD RBX, 16 ;
  371. MOVAPD XMM1, [RCX] ;
  372. ADD RCX, 16 ;
  373. MULPD XMM0, XMM7 ;
  374. ADDPD XMM2, XMM0 ;
  375. MOVAPD XMM0, [RCX] ;
  376. ADD RCX, 16 ;
  377. MULPD XMM1, XMM7 ;
  378. ADDPD XMM3, XMM1 ;
  379. MOVAPD XMM1, [RCX] ;
  380. ADD RCX, 16 ;
  381. MULPD XMM0, XMM7 ;
  382. ADDPD XMM4, XMM0 ;
  383. MOVAPD XMM0, [RCX] ;
  384. ADD RCX, 16 ;
  385. MULPD XMM1, XMM7 ;
  386. ADDPD XMM5, XMM1 ;
  387. MOVAPD XMM1, [RCX] ;
  388. ADD RCX, 16 ;
  389. MULPD XMM0, XMM7 ;
  390. ADDPD XMM6, XMM0
  391. ; (*-- 6 -- *) ;
  392. MOVAPD XMM7, [RBX] ;
  393. ADD RBX, 16 ;
  394. MOVAPD XMM0, [RCX] ;
  395. ADD RCX, 16 ;
  396. MULPD XMM1, XMM7 ;
  397. ADDPD XMM2, XMM1 ;
  398. MOVAPD XMM1, [RCX] ;
  399. ADD RCX, 16 ;
  400. MULPD XMM0, XMM7 ;
  401. ADDPD XMM3, XMM0 ;
  402. MOVAPD XMM0, [RCX] ;
  403. ADD RCX, 16 ;
  404. MULPD XMM1, XMM7 ;
  405. ADDPD XMM4, XMM1 ;
  406. MOVAPD XMM1, [RCX] ;
  407. ADD RCX, 16 ;
  408. MULPD XMM0, XMM7 ;
  409. ADDPD XMM5, XMM0 ;
  410. MULPD XMM1, XMM7 ;
  411. ADDPD XMM6, XMM1 ;
  412. SUB RDX, 8
  413. JMP kLoop8 ;
  414. kLoop2: ;
  415. CMP RDX, 0 ;
  416. JLE horizontalAdd ;
  417. MOVAPD XMM7, [RBX] ;
  418. ADD RBX, 16 ;
  419. MOVAPD XMM0, [RCX] ;
  420. ADD RCX, 16 ;
  421. MOVAPD XMM1, [RCX] ;
  422. ADD RCX, 16 ;
  423. MULPD XMM0, XMM7 ;
  424. ADDPD XMM2, XMM0 ;
  425. MOVAPD XMM0, [RCX] ;
  426. ADD RCX, 16 ;
  427. MULPD XMM1, XMM7 ;
  428. ADDPD XMM3, XMM1 ;
  429. MOVAPD XMM1, [RCX] ;
  430. ADD RCX, 16 ;
  431. MULPD XMM0, XMM7 ;
  432. ADDPD XMM4, XMM0 ;
  433. MOVAPD XMM0, [RCX] ;
  434. ADD RCX, 16 ;
  435. MULPD XMM1, XMM7 ;
  436. ADDPD XMM5, XMM1 ;
  437. MULPD XMM0, XMM7 ;
  438. ADDPD XMM6, XMM0 ;
  439. SUB RDX, 2
  440. JMP kLoop2 ;
  441. horizontalAdd: ; add and store
  442. MOV RDI, [RSP+adrC] ;
  443. MOV RAX, [RSP+IncC] ;
  444. MOVAPD XMM1, XMM2 ;
  445. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  446. ADDPD XMM2, XMM1 ;
  447. ADDSD XMM2, [RDI] ;
  448. MOVSD [RDI], XMM2 ;
  449. ADD RDI, RAX ;
  450. MOVAPD XMM1, XMM3 ;
  451. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  452. ADDPD XMM3, XMM1 ;
  453. ADDSD XMM3, [RDI] ;
  454. MOVSD [RDI], XMM3 ;
  455. ADD RDI, RAX ;
  456. MOVAPD XMM1, XMM4 ;
  457. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  458. ADDPD XMM4, XMM1 ;
  459. ADDSD XMM4, [RDI] ;
  460. MOVSD [RDI], XMM4 ;
  461. ADD RDI, RAX ;
  462. MOVAPD XMM1, XMM5 ;
  463. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  464. ADDPD XMM5, XMM1 ;
  465. ADDSD XMM5, [RDI] ;
  466. MOVSD [RDI], XMM5 ;
  467. ADD RDI, RAX ;
  468. MOVAPD XMM1, XMM6 ;
  469. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  470. ADDPD XMM6, XMM1 ;
  471. ADDSD XMM6, [RDI] ;
  472. MOVSD [RDI], XMM6 ;
  473. endL:
  474. ADD RSP, 40 ;
  475. END L1Block5XSSE;
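(* Added note: L1Block5XSSE is the 5-column micro kernel. One packed stream of A (loaded into XMM7) is
   multiplied with five interleaved packed streams of B, accumulated in XMM2..XMM6; at the end each
   accumulator is summed horizontally and added to C at adrC, adrC+IncC, ..., adrC+4*IncC
   (cf. the L1BlockN = 5 constant above). *)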
  476. PROCEDURE -L1Block1RA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  477. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  478. MOV RAX, [RSP+K] ; RAX IS counter
  479. MOV RDX, [RSP+adrC]
  480. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  481. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  482. FLD DWORD [RDX] ; S.GET(dadr, x)
  483. loop16:
  484. CMP RAX, 16
  485. JL loop1
  486. FLD DWORD[RBX] ; S.GET(ladr, x)
  487. ADD RBX, 4 ; INC(ladr, incl)
  488. FLD DWORD[RCX] ; S.GET(ladr, y)
  489. ADD RCX, 4 ; INC(radr, incr)
  490. FMULP ; x := x*y
  491. FADDP ; z := z+x
  492. FLD DWORD[RBX] ; S.GET(ladr, x)
  493. ADD RBX, 4 ; INC(ladr, incl)
  494. FLD DWORD[RCX] ; S.GET(ladr, y)
  495. ADD RCX, 4 ; INC(radr, incr)
  496. FMULP ; x := x*y
  497. FADDP ; z := z+x
  498. FLD DWORD[RBX] ; S.GET(ladr, x)
  499. ADD RBX, 4 ; INC(ladr, incl)
  500. FLD DWORD[RCX] ; S.GET(ladr, y)
  501. ADD RCX, 4 ; INC(radr, incr)
  502. FMULP ; x := x*y
  503. FADDP ; z := z+x
  504. FLD DWORD[RBX] ; S.GET(ladr, x)
  505. ADD RBX, 4 ; INC(ladr, incl)
  506. FLD DWORD[RCX] ; S.GET(ladr, y)
  507. ADD RCX, 4 ; INC(radr, incr)
  508. FMULP ; x := x*y
  509. FADDP ; z := z+x
  510. FLD DWORD[RBX] ; S.GET(ladr, x)
  511. ADD RBX, 4 ; INC(ladr, incl)
  512. FLD DWORD[RCX] ; S.GET(ladr, y)
  513. ADD RCX, 4 ; INC(radr, incr)
  514. FMULP ; x := x*y
  515. FADDP ; z := z+x
  516. FLD DWORD[RBX] ; S.GET(ladr, x)
  517. ADD RBX, 4 ; INC(ladr, incl)
  518. FLD DWORD[RCX] ; S.GET(ladr, y)
  519. ADD RCX, 4 ; INC(radr, incr)
  520. FMULP ; x := x*y
  521. FADDP ; z := z+x
  522. FLD DWORD[RBX] ; S.GET(ladr, x)
  523. ADD RBX, 4 ; INC(ladr, incl)
  524. FLD DWORD[RCX] ; S.GET(ladr, y)
  525. ADD RCX, 4 ; INC(radr, incr)
  526. FMULP ; x := x*y
  527. FADDP ; z := z+x
  528. FLD DWORD[RBX] ; S.GET(ladr, x)
  529. ADD RBX, 4 ; INC(ladr, incl)
  530. FLD DWORD[RCX] ; S.GET(ladr, y)
  531. ADD RCX, 4 ; INC(radr, incr)
  532. FMULP ; x := x*y
  533. FADDP ; z := z+x
  534. FLD DWORD[RBX] ; S.GET(ladr, x)
  535. ADD RBX, 4 ; INC(ladr, incl)
  536. FLD DWORD[RCX] ; S.GET(ladr, y)
  537. ADD RCX, 4 ; INC(radr, incr)
  538. FMULP ; x := x*y
  539. FADDP ; z := z+x
  540. FLD DWORD[RBX] ; S.GET(ladr, x)
  541. ADD RBX, 4 ; INC(ladr, incl)
  542. FLD DWORD[RCX] ; S.GET(ladr, y)
  543. ADD RCX, 4 ; INC(radr, incr)
  544. FMULP ; x := x*y
  545. FADDP ; z := z+x
  546. FLD DWORD[RBX] ; S.GET(ladr, x)
  547. ADD RBX, 4 ; INC(ladr, incl)
  548. FLD DWORD[RCX] ; S.GET(ladr, y)
  549. ADD RCX, 4 ; INC(radr, incr)
  550. FMULP ; x := x*y
  551. FADDP ; z := z+x
  552. FLD DWORD[RBX] ; S.GET(ladr, x)
  553. ADD RBX, 4 ; INC(ladr, incl)
  554. FLD DWORD[RCX] ; S.GET(ladr, y)
  555. ADD RCX, 4 ; INC(radr, incr)
  556. FMULP ; x := x*y
  557. FADDP ; z := z+x
  558. FLD DWORD[RBX] ; S.GET(ladr, x)
  559. ADD RBX, 4 ; INC(ladr, incl)
  560. FLD DWORD[RCX] ; S.GET(ladr, y)
  561. ADD RCX, 4 ; INC(radr, incr)
  562. FMULP ; x := x*y
  563. FADDP ; z := z+x
  564. FLD DWORD[RBX] ; S.GET(ladr, x)
  565. ADD RBX, 4 ; INC(ladr, incl)
  566. FLD DWORD[RCX] ; S.GET(ladr, y)
  567. ADD RCX, 4 ; INC(radr, incr)
  568. FMULP ; x := x*y
  569. FADDP ; z := z+x
  570. FLD DWORD[RBX] ; S.GET(ladr, x)
  571. ADD RBX, 4 ; INC(ladr, incl)
  572. FLD DWORD[RCX] ; S.GET(ladr, y)
  573. ADD RCX, 4 ; INC(radr, incr)
  574. FMULP ; x := x*y
  575. FADDP ; z := z+x
  576. FLD DWORD[RBX] ; S.GET(ladr, x)
  577. ADD RBX, 4 ; INC(ladr, incl)
  578. FLD DWORD[RCX] ; S.GET(ladr, y)
  579. ADD RCX, 4 ; INC(radr, incr)
  580. FMULP ; x := x*y
  581. FADDP ; z := z+x
  582. SUB RAX, 16 ; DEC(len)
  583. JMP loop16 ;
  584. loop1:
  585. CMP RAX, 0 ; WHILE len > 0 DO
  586. JLE endL
  587. FLD DWORD[RBX] ; S.GET(ladr, x)
  588. ADD RBX, 4 ; INC(ladr, incl)
  589. FLD DWORD[RCX] ; S.GET(ladr, y)
  590. ADD RCX, 4 ; INC(radr, incr)
  591. FMULP ; x := x*y
  592. FADDP ; z := z+x
  593. DEC RAX ; DEC(len)
  594. JMP loop1 ;
  595. endL:
  596. FSTP DWORD[RDX] ; S.PUT(dadr, x)
  597. FWAIT ;
  598. ADD RSP, 32 ;
  599. END L1Block1RA;
  600. PROCEDURE -L1Block1RSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  601. (*
  602. matrixA, matrixB must be stored in special format
  603. K>0 guaranteed
  604. *)
  605. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  606. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  607. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  608. MOV RDX, [RSP+K] ; RDX IS counter
  609. XORPS XMM2, XMM2 ;
  610. kLoop16: ;
  611. CMP RDX, 16 ;
  612. JL kLoop4 ;
  613. MOVAPS XMM7, [RBX] ;
  614. MOVAPS XMM0, [RCX] ;
  615. ADD RCX, 16 ;
  616. ADD RBX, 16 ;
  617. MOVAPS XMM6, [RBX] ;
  618. MOVAPS XMM1, [RCX] ;
  619. ADD RCX, 16 ;
  620. ADD RBX, 16 ;
  621. MULPS XMM0, XMM7 ;
  622. ADDPS XMM2, XMM0 ;
  623. MOVAPS XMM5, [RBX] ;
  624. MOVAPS XMM3, [RCX] ;
  625. ADD RCX, 16 ;
  626. ADD RBX, 16 ;
  627. MULPS XMM1, XMM6 ;
  628. ADDPS XMM2, XMM1 ;
  629. MOVAPS XMM7, [RBX] ;
  630. MOVAPS XMM0, [RCX] ;
  631. ADD RCX, 16 ;
  632. ADD RBX, 16 ;
  633. MULPS XMM3, XMM5 ;
  634. ADDPS XMM2, XMM3 ;
  635. MULPS XMM0, XMM7 ;
  636. ADDPS XMM2, XMM0 ;
  637. SUB RDX, 16 ;
  638. JMP kLoop16 ;
  639. kLoop4: ;
  640. CMP RDX, 0 ;
  641. JLE horizontalAdd ;
  642. MOVAPS XMM7, [RBX] ;
  643. MOVAPS XMM0, [RCX] ;
  644. ADD RCX, 16 ;
  645. ADD RBX, 16 ;
  646. MULPS XMM0, XMM7 ;
  647. ADDPS XMM2, XMM0 ;
  648. SUB RDX, 4
  649. JMP kLoop4 ;
  650. horizontalAdd:
  651. MOV RDI, [RSP+adrC] ;
  652. MOVLHPS XMM1, XMM2 ;
  653. ADDPS XMM1, XMM2 ;
  654. SHUFPS XMM2, XMM1, 48 ;
  655. ADDPS XMM2, XMM1 ;
  656. MOVHLPS XMM2, XMM2 ;
  657. ADDSS XMM2, [RDI] ;
  658. MOVSS [RDI], XMM2 ;
  659. endL:
  660. ADD RSP, 32 ;
  661. END L1Block1RSSE;
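(* Added note: for REAL the accumulator XMM2 holds four partial sums; the MOVLHPS/ADDPS/SHUFPS/ADDPS/MOVHLPS
   sequence reduces them to a single sum in the low lane, which ADDSS then adds to the destination element
   of C. K is processed in chunks of 16 and 4, so K is effectively assumed to be a multiple of 4. *)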
  662. PROCEDURE -L1Block5RSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  663. (*
  664. matrixA and matrix B are stored in special format !
  665. K > 0 is guaranteed
  666. *)
  667. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  668. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  669. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  670. MOV RDX, [RSP+K] ; RDX IS counter
  671. XORPS XMM2, XMM2 ;
  672. XORPS XMM3, XMM3 ;
  673. XORPS XMM4, XMM4 ;
  674. XORPS XMM5, XMM5 ;
  675. XORPS XMM6, XMM6 ;
  676. kLoop16: ;
  677. CMP RDX, 16 ;
  678. JL kLoop4 ; (*-- 0 -- *)
  679. MOVAPS XMM7, [RBX] ; get 4 elements OF A
  680. ADD RBX, 16 ;
  681. MOVAPS XMM0, [RCX] ; get 4 elements OF B
  682. ADD RCX, 16 ;
  683. MOVAPS XMM1, [RCX] ; get 4 elements OF B
  684. ADD RCX, 16 ;
  685. MULPS XMM0, XMM7 ;
  686. ADDPS XMM2, XMM0 ;
  687. MOVAPS XMM0, [RCX] ;
  688. ADD RCX, 16 ;
  689. MULPS XMM1, XMM7 ;
  690. ADDPS XMM3, XMM1 ;
  691. MOVAPS XMM1, [RCX] ;
  692. ADD RCX, 16 ;
  693. MULPS XMM0, XMM7 ;
  694. ADDPS XMM4, XMM0 ;
  695. MOVAPS XMM0, [RCX] ;
  696. ADD RCX, 16 ;
  697. MULPS XMM1, XMM7 ;
  698. ADDPS XMM5, XMM1 ;
  699. MOVAPS XMM1, [RCX] ;
  700. ADD RCX, 16 ;
  701. MULPS XMM0, XMM7 ;
  702. ADDPS XMM6, XMM0
  703. ; (*-- 4 -- *) ;
  704. MOVAPS XMM7, [RBX] ;
  705. ADD RBX, 16 ;
  706. MOVAPS XMM0, [RCX] ;
  707. ADD RCX, 16 ;
  708. MULPS XMM1, XMM7 ;
  709. ADDPS XMM2, XMM1 ;
  710. MOVAPS XMM1, [RCX] ;
  711. ADD RCX, 16 ;
  712. MULPS XMM0, XMM7 ;
  713. ADDPS XMM3, XMM0 ;
  714. MOVAPS XMM0, [RCX] ;
  715. ADD RCX, 16 ;
  716. MULPS XMM1, XMM7 ;
  717. ADDPS XMM4, XMM1 ;
  718. MOVAPS XMM1, [RCX] ;
  719. ADD RCX, 16 ;
  720. MULPS XMM0, XMM7 ;
  721. ADDPS XMM5, XMM0 ;
  722. MOVAPS XMM0, [RCX] ;
  723. ADD RCX, 16 ;
  724. MULPS XMM1, XMM7 ;
  725. ADDPS XMM6, XMM1
  726. ; (*-- 8 -- *) ;
  727. MOVAPS XMM7, [RBX] ;
  728. ADD RBX, 16 ;
  729. MOVAPS XMM1, [RCX] ;
  730. ADD RCX, 16 ;
  731. MULPS XMM0, XMM7 ;
  732. ADDPS XMM2, XMM0 ;
  733. MOVAPS XMM0, [RCX] ;
  734. ADD RCX, 16 ;
  735. MULPS XMM1, XMM7 ;
  736. ADDPS XMM3, XMM1 ;
  737. MOVAPS XMM1, [RCX] ;
  738. ADD RCX, 16 ;
  739. MULPS XMM0, XMM7 ;
  740. ADDPS XMM4, XMM0 ;
  741. MOVAPS XMM0, [RCX] ;
  742. ADD RCX, 16 ;
  743. MULPS XMM1, XMM7 ;
  744. ADDPS XMM5, XMM1 ;
  745. MOVAPS XMM1, [RCX] ;
  746. ADD RCX, 16 ;
  747. MULPS XMM0, XMM7 ;
  748. ADDPS XMM6, XMM0
  749. ; (*-- 12 -- *) ;
  750. MOVAPS XMM7, [RBX] ;
  751. ADD RBX, 16 ;
  752. MOVAPS XMM0, [RCX] ;
  753. ADD RCX, 16 ;
  754. MULPS XMM1, XMM7 ;
  755. ADDPS XMM2, XMM1 ;
  756. MOVAPS XMM1, [RCX] ;
  757. ADD RCX, 16 ;
  758. MULPS XMM0, XMM7 ;
  759. ADDPS XMM3, XMM0 ;
  760. MOVAPS XMM0, [RCX] ;
  761. ADD RCX, 16 ;
  762. MULPS XMM1, XMM7 ;
  763. ADDPS XMM4, XMM1 ;
  764. MOVAPS XMM1, [RCX] ;
  765. ADD RCX, 16 ;
  766. MULPS XMM0, XMM7 ;
  767. ADDPS XMM5, XMM0 ;
  768. MULPS XMM1, XMM7 ;
  769. ADDPS XMM6, XMM1 ;
  770. SUB RDX, 16
  771. JMP kLoop16 ;
  772. kLoop4: ;
  773. CMP RDX, 0 ;
  774. JLE horizontalAdd ;
  775. MOVAPS XMM7, [RBX] ;
  776. ADD RBX, 16 ;
  777. MOVAPS XMM0, [RCX] ;
  778. ADD RCX, 16 ;
  779. MOVAPS XMM1, [RCX] ;
  780. ADD RCX, 16 ;
  781. MULPS XMM0, XMM7 ;
  782. ADDPS XMM2, XMM0 ;
  783. MOVAPS XMM0, [RCX] ;
  784. ADD RCX, 16 ;
  785. MULPS XMM1, XMM7 ;
  786. ADDPS XMM3, XMM1 ;
  787. MOVAPS XMM1, [RCX] ;
  788. ADD RCX, 16 ;
  789. MULPS XMM0, XMM7 ;
  790. ADDPS XMM4, XMM0 ;
  791. MOVAPS XMM0, [RCX] ;
  792. ADD RCX, 16 ;
  793. MULPS XMM1, XMM7 ;
  794. ADDPS XMM5, XMM1 ;
  795. MULPS XMM0, XMM7 ;
  796. ADDPS XMM6, XMM0 ;
  797. SUB RDX, 4
  798. JMP kLoop4 ;
  799. horizontalAdd: ; add and store
  800. MOV RDI, [RSP+adrC] ;
  801. MOV RAX, [RSP+IncC] ;
  802. MOVLHPS XMM1, XMM2 ;
  803. ADDPS XMM1, XMM2 ;
  804. SHUFPS XMM2, XMM1, 48 ;
  805. ADDPS XMM2, XMM1 ;
  806. MOVHLPS XMM2, XMM2 ;
  807. ADDSS XMM2, [RDI] ;
  808. MOVSS [RDI], XMM2 ;
  809. ADD RDI, RAX ;
  810. MOVLHPS XMM1, XMM3 ;
  811. ADDPS XMM1, XMM3 ;
  812. SHUFPS XMM3, XMM1, 48 ;
  813. ADDPS XMM3, XMM1 ;
  814. MOVHLPS XMM3, XMM3 ;
  815. ADDSS XMM3, [RDI] ;
  816. MOVSS [RDI], XMM3 ;
  817. ADD RDI, RAX ;
  818. MOVLHPS XMM1, XMM4 ;
  819. ADDPS XMM1, XMM4 ;
  820. SHUFPS XMM4, XMM1, 48 ;
  821. ADDPS XMM4, XMM1 ;
  822. MOVHLPS XMM4, XMM4 ;
  823. ADDSS XMM4, [RDI] ;
  824. MOVSS [RDI], XMM4 ;
  825. ADD RDI, RAX ;
  826. MOVLHPS XMM1, XMM5 ;
  827. ADDPS XMM1, XMM5 ;
  828. SHUFPS XMM5, XMM1, 48 ;
  829. ADDPS XMM5, XMM1 ;
  830. MOVHLPS XMM5, XMM5 ;
  831. ADDSS XMM5, [RDI] ;
  832. MOVSS [RDI], XMM5 ;
  833. ADD RDI, RAX ;
  834. MOVLHPS XMM1, XMM6 ;
  835. ADDPS XMM1, XMM6 ;
  836. SHUFPS XMM6, XMM1, 48 ;
  837. ADDPS XMM6, XMM1 ;
  838. MOVHLPS XMM6, XMM6 ;
  839. ADDSS XMM6, [RDI] ;
  840. MOVSS [RDI], XMM6 ;
  841. endL:
  842. ADD RSP, 40 ;
  843. END L1Block5RSSE;
  844. PROCEDURE -Align4( adr: ADDRESS ): ADDRESS;
  845. CODE {SYSTEM.AMD64}
  846. MOV RAX, [RSP+adr] ;
  847. NEG RAX ;
  848. AND RAX, 3H ;
  849. ADD RAX, [RSP+adr] ;
  850. ADD RSP, 8
  851. END Align4;
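(* Added note (worked example): Align4 rounds an address up to the next multiple of 4 by adding
   (-adr) MOD 4, e.g. adr = 1003 -> (-1003) AND 3 = 1 -> result 1004; an already aligned address is
   returned unchanged. Align2 below applies the same trick with a 2-byte granularity. *)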
  852. PROCEDURE -Align2( adr: ADDRESS ): ADDRESS;
  853. CODE {SYSTEM.AMD64}
  854. MOV RAX, [RSP+adr] ;
  855. NEG RAX ;
  856. AND RAX, 1H ;
  857. ADD RAX, [RSP+adr] ;
  858. ADD RSP, 8
  859. END Align2;
  860. PROCEDURE -ZeroR( adr: ADDRESS; count: SIZE );
  861. (** For 32 bit types *)
  862. CODE {SYSTEM.AMD64}
  863. MOV RDI, [RSP+adr] ; address OF dest index
  864. MOV RCX, [RSP+count] ; counter
  865. MOV RAX, 0 ; value
  866. CLD ; incremental
  867. REP ;
  868. STOSD ;
  869. ADD RSP, 16 ;
  870. END ZeroR;
  871. PROCEDURE -ZeroX( adr: ADDRESS; count: SIZE );
  872. (** For 64 bit types *)
  873. CODE {SYSTEM.AMD64}
  874. MOV RDI, [RSP+adr] ; address OF dest index
  875. MOV RCX, [RSP+count] ; counter
  876. SHL RCX, 1 ;
  877. MOV RAX, 0 ; value
  878. CLD ; incremental
  879. REP ;
  880. STOSD ;
  881. ADD RSP, 16 ;
  882. END ZeroX;
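(* Added note: ZeroR/ZeroX clear count elements with REP STOSD (32-bit stores); ZeroX doubles the counter
   (SHL RCX, 1) because every 64-bit element needs two doubleword stores. The strided variants
   ZeroRI/ZeroXI below use an explicit loop unless the increment equals the element size, in which case
   they branch to the same REP STOSD fast path. *)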
  883. PROCEDURE -ZeroRI( adr: SIZE; inc, count: SIZE );
  884. (** For 32 bit types *)
  885. CODE {SYSTEM.AMD64}
  886. MOV RDI, [RSP+adr] ; address OF dest index
  887. MOV RBX, [RSP+inc] ;
  888. MOV RCX, [RSP+count] ; counter
  889. CMP RBX, 4 ;
  890. JE fastzero ;
  891. MOV RAX, 0 ;
  892. loopL:
  893. CMP RCX, 0 ;
  894. JLE endL ;
  895. MOV [RDI], RAX ;
  896. ADD RDI, RBX ;
  897. DEC RCX ;
  898. JMP loopL ;
  899. fastzero:
  900. MOV RAX, 0 ; value
  901. CLD ; incremental
  902. REP ;
  903. STOSD ;
  904. endL:
  905. ADD RSP, 24 ;
  906. END ZeroRI;
  907. PROCEDURE -ZeroXI( adr: ADDRESS; inc, count: SIZE );
  908. (** For 64 bit types *)
  909. CODE {SYSTEM.AMD64}
  910. MOV RDI, [RSP+adr] ; address OF dest index
  911. MOV RBX, [RSP+inc] ;
  912. MOV RCX, [RSP+count] ; counter
  913. MOV RAX, 0 ;
  914. CMP RBX, 8 ;
  915. JE fastzero ;
  916. loopL:
  917. CMP RCX, 0 ;
  918. JLE endL ;
  919. MOV [RDI], RAX ;
  920. MOV [RDI+4], RAX ;
  921. ADD RDI, RBX ;
  922. DEC RCX ;
  923. JMP loopL ;
  924. fastzero:
  925. SHL RCX, 1 ;
  926. CLD ; incremental
  927. REP ;
  928. STOSD ;
  929. endL:
  930. ADD RSP, 24 ;
  931. END ZeroXI;
  932. PROCEDURE -MovR( from, to0, frominc, count: SIZE );
  933. CODE {SYSTEM.AMD64}
  934. MOV RDI, [RSP+to0] ; TO
  935. MOV RSI, [RSP+from] ; from
  936. MOV RCX, [RSP+count] ; count
  937. MOV RBX, [RSP+frominc] ; inc
  938. CMP RBX, 4 ;
  939. JE fastmove ;
  940. loopL:
  941. CMP RCX, 0 ;
  942. JLE endL ;
  943. MOV RAX, [RSI] ;
  944. MOV [RDI], RAX ;
  945. ADD RSI, RBX ;
  946. ADD RDI, 4 ;
  947. DEC RCX ;
  948. JMP loopL ;
  949. fastmove:
  950. CLD ; incremental
  951. REP ;
  952. MOVSD ; move data IN 4 byte steps
  953. endL:
  954. ADD RSP, 32 ;
  955. END MovR;
  956. PROCEDURE -MovX( from, to0: ADDRESS; frominc, count:SIZE );
  957. CODE {SYSTEM.AMD64}
  958. MOV RDI, [RSP+to0] ; TO
  959. MOV RSI, [RSP+from] ; from
  960. MOV RCX, [RSP+count] ; count
  961. MOV RBX, [RSP+frominc] ; inc
  962. CMP RBX, 8 ;
  963. JE fastmove ;
  964. loopL:
  965. CMP RCX, 0 ;
  966. JLE endL ;
  967. MOV RAX, [RSI] ;
  968. MOV [RDI], RAX ;
  969. MOV RAX, [RSI+4] ;
  970. MOV [RDI+4], RAX ;
  971. ADD RSI, RBX ;
  972. ADD RDI, 8 ;
  973. DEC RCX ;
  974. JMP loopL ;
  975. fastmove:
  976. SHL RCX, 1 ;
  977. CLD ; incremental
  978. REP ;
  979. MOVSD ; move data IN 4 byte steps
  980. endL:
  981. ADD RSP, 32 ;
  982. END MovX;
  983. PROCEDURE -MovR5( src: ADDRESS; inc, stride: SIZE; dest: ADDRESS; count: SIZE);
  984. CODE {SYSTEM.AMD64}
  985. MOV RSI, [RSP+src] ; src
  986. MOV RBX, [RSP+inc] ; inc
  987. MOV RCX, [RSP+stride] ; stride
  988. MOV RDI, [RSP+dest] ; dest
  989. loopL:
  990. MOV RAX, [RSP+count] ; count
  991. CMP RAX, 0 ;
  992. JLE endL ;
  993. SUB RAX, 4 ;
  994. MOV [RSP+count], RAX ;
  995. MOV RDX, RSI ;
  996. MOV RAX, [RDX] ;
  997. MOV [RDI], RAX ;
  998. ADD RDX, RBX ;
  999. MOV RAX, [RDX] ;
  1000. MOV [RDI+16], RAX ;
  1001. ADD RDX, RBX ;
  1002. MOV RAX, [RDX] ;
  1003. MOV [RDI+32], RAX ;
  1004. ADD RDX, RBX ;
  1005. MOV RAX, [RDX] ;
  1006. MOV [RDI+48], RAX ;
  1007. ADD RDX, RBX ;
  1008. MOV RAX, [RDX] ;
  1009. MOV [RDI+64], RAX ;
  1010. ADD RSI, RCX ;
  1011. ADD RDI, 4 ;
  1012. MOV RDX, RSI ;
  1013. MOV RAX, [RDX] ;
  1014. MOV [RDI], RAX ;
  1015. ADD RDX, RBX ;
  1016. MOV RAX, [RDX] ;
  1017. MOV [RDI+16], RAX ;
  1018. ADD RDX, RBX ;
  1019. MOV RAX, [RDX] ;
  1020. MOV [RDI+32], RAX ;
  1021. ADD RDX, RBX ;
  1022. MOV RAX, [RDX] ;
  1023. MOV [RDI+48], RAX ;
  1024. ADD RDX, RBX ;
  1025. MOV RAX, [RDX] ;
  1026. MOV [RDI+64], RAX ;
  1027. ADD RSI, RCX ;
  1028. ADD RDI, 4 ;
  1029. MOV RDX, RSI ;
  1030. MOV RAX, [RDX] ;
  1031. MOV [RDI], RAX ;
  1032. ADD RDX, RBX ;
  1033. MOV RAX, [RDX] ;
  1034. MOV [RDI+16], RAX ;
  1035. ADD RDX, RBX ;
  1036. MOV RAX, [RDX] ;
  1037. MOV [RDI+32], RAX ;
  1038. ADD RDX, RBX ;
  1039. MOV RAX, [RDX] ;
  1040. MOV [RDI+48], RAX ;
  1041. ADD RDX, RBX ;
  1042. MOV RAX, [RDX] ;
  1043. MOV [RDI+64], RAX ;
  1044. ADD RSI, RCX ;
  1045. ADD RDI, 4 ;
  1046. MOV RDX, RSI ;
  1047. MOV RAX, [RDX] ;
  1048. MOV [RDI], RAX ;
  1049. ADD RDX, RBX ;
  1050. MOV RAX, [RDX] ;
  1051. MOV [RDI+16], RAX ;
  1052. ADD RDX, RBX ;
  1053. MOV RAX, [RDX] ;
  1054. MOV [RDI+32], RAX ;
  1055. ADD RDX, RBX ;
  1056. MOV RAX, [RDX] ;
  1057. MOV [RDI+48], RAX ;
  1058. ADD RDX, RBX ;
  1059. MOV RAX, [RDX] ;
  1060. MOV [RDI+64], RAX ;
  1061. ADD RSI, RCX ;
  1062. ADD RDI, 4 ;
  1063. ADD RDI, 64 ;
  1064. JMP loopL ;
  1065. endL:
  1066. ADD RSP, 40 ;
  1067. END MovR5;
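(* Added note (layout sketch, based on reading the code): MovR5 gathers groups of five 32-bit elements,
   spaced inc bytes apart in the source, and stores each group at destination offsets 0, 16, 32, 48, 64;
   successive groups (source advanced by stride) start 4 bytes later, and after four groups the
   destination advances to the next 80-byte block. This produces the interleaved operand layout consumed
   by the 5-column SSE kernels. count counts groups and is assumed to be a multiple of 4. *)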
  1068. (* *)
  1069. PROCEDURE AddAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1070. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1071. MOV RAX, [RBP+len] ;
  1072. MOV RBX, [RBP+ladr] ;
  1073. MOV RCX, [RBP+radr] ;
  1074. MOV RDX, [RBP+dadr] ;
  1075. start:
  1076. CMP RAX, 0 ;
  1077. JLE endL ;
  1078. FLD QWORD [RBX] ;
  1079. ADD RBX, [RBP+linc] ;
  1080. FLD QWORD [RCX] ;
  1081. ADD RCX, [RBP+rinc] ;
  1082. FADDP ;
  1083. FSTP QWORD [RDX] ;
  1084. ADD RDX, [RBP+dinc] ;
  1085. DEC RAX ;
  1086. JMP start ;
  1087. endL:
  1088. FWAIT ;
  1089. END AddAXAXLoopA;
  1090. PROCEDURE AddARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1091. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1092. MOV RAX, [RBP+len] ;
  1093. MOV RBX, [RBP+ladr] ;
  1094. MOV RCX, [RBP+radr] ;
  1095. MOV RDX, [RBP+dadr] ;
  1096. start:
  1097. CMP RAX, 0 ;
  1098. JLE endL ;
  1099. FLD DWORD [RBX] ;
  1100. ADD RBX, [RBP+linc] ;
  1101. FLD DWORD [RCX] ;
  1102. ADD RCX, [RBP+rinc] ;
  1103. FADDP ;
  1104. FSTP DWORD [RDX] ;
  1105. ADD RDX, [RBP+dinc] ;
  1106. DEC RAX ;
  1107. JMP start ;
  1108. endL:
  1109. FWAIT ;
  1110. END AddARARLoopA;
  1111. PROCEDURE AddAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1112. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1113. MOV RAX, [RBP+len] ;
  1114. CMP RAX, 0 ;
  1115. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1116. MOV RBX, [RBP+ladr] ;
  1117. MOV RCX, [RBP+radr] ;
  1118. MOV RDX, [RBP+dadr] ;
  1119. ; check IF data are contiguous IN memory
1120. CMP [RBP+linc], 8 ; check left FOR continuity
1121. JNE single ; not contiguous -> simplest method
1122. CMP [RBP+rinc], 8 ; check right FOR continuity
1123. JNE single ; not contiguous -> simplest method
1124. CMP [RBP+dinc], 8 ; check destination FOR continuity
1125. JNE single ; not contiguous -> simplest method
  1126. ; check FOR alignment
  1127. MOV RSI, RBX ;
  1128. AND RSI, 7 ; ladr MOD 8
  1129. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1130. JNE unaligned ; not 64 bit aligned
  1131. MOV RSI, RCX ;
  1132. AND RSI, 7 ; radr MOD 8
  1133. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1134. JNE unaligned ; not 64 bit aligned
  1135. MOV RSI, RDX ;
  1136. AND RSI, 7 ; dadr MOD 8
  1137. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1138. JNE unaligned ; not 64 bit aligned
  1139. MOV RSI, RBX ;
  1140. AND RSI, 8 ; 16 byte alignment
  1141. MOV RDI, RCX ;
  1142. AND RDI, 8 ; 16 byte alignment
  1143. CMP RSI, RDI ;
  1144. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1145. MOV RDI, RDX ;
  1146. AND RDI, 8 ; 16 byte alignment
  1147. CMP RSI, RDI ;
  1148. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1149. CMP RSI, 8 ;
1150. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1151. ; process one single element TO achieve 128 bit alignment
  1152. MOVSD XMM1, [RBX] ;
  1153. MOVSD XMM0, [RCX] ;
  1154. ADDSD XMM0, XMM1 ;
  1155. MOVSD [RDX], XMM0 ;
  1156. ADD RBX, 8 ; now RBX IS 16 byte aligned
1157. ADD RCX, 8 ; now RCX IS 16 byte aligned ;
  1158. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  1159. DEC RAX ; one element has been processed
  1160. aligned:
  1161. aligned8:
  1162. CMP RAX, 8 ;
1163. JL aligned2 ; len < 8 -> EXIT TO aligned2
  1164. MOVAPD XMM0, [RBX] ;
  1165. MOVAPD XMM1, [RBX+16] ;
  1166. MOVAPD XMM2, [RBX+32] ;
  1167. MOVAPD XMM3, [RBX+48] ;
  1168. ADD RBX, 64 ;
  1169. MOVAPD XMM4, [RCX] ;
  1170. MOVAPD XMM5, [RCX+16] ;
  1171. MOVAPD XMM6, [RCX+32] ;
  1172. MOVAPD XMM7, [RCX+48] ;
  1173. ADD RCX, 64 ;
  1174. ADDPD XMM0, XMM4 ;
  1175. ADDPD XMM1, XMM5 ;
  1176. ADDPD XMM2, XMM6 ;
  1177. ADDPD XMM3, XMM7 ;
  1178. MOVAPD [RDX], XMM0 ;
  1179. MOVAPD [RDX+16], XMM1 ;
  1180. MOVAPD [RDX+32], XMM2 ;
  1181. MOVAPD [RDX+48], XMM3 ;
  1182. ADD RDX, 64 ;
  1183. SUB RAX, 8 ;
  1184. JMP aligned8 ;
  1185. ; LOOP FOR 2 pieces aligned
  1186. aligned2: ;
  1187. CMP RAX, 2 ;
  1188. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1189. MOVAPD XMM0, [RBX] ;
  1190. ADD RBX, 16 ;
  1191. MOVAPD XMM1, [RCX] ;
  1192. ADD RCX, 16 ;
  1193. ADDPD XMM0, XMM1 ;
  1194. MOVAPD [RDX], XMM0 ;
  1195. ADD RDX, 16 ;
  1196. SUB RAX, 2 ;
  1197. JMP aligned2 ;
  1198. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1199. unaligned: ;
  1200. unaligned8: ;
  1201. CMP RAX, 8 ;
1202. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  1203. MOVUPD XMM0, [RBX] ;
  1204. MOVUPD XMM1, [RBX+16] ;
  1205. MOVUPD XMM2, [RBX+32] ;
  1206. MOVUPD XMM3, [RBX+48] ;
  1207. ADD RBX, 64 ;
  1208. MOVUPD XMM4, [RCX] ;
  1209. MOVUPD XMM5, [RCX+16] ;
  1210. MOVUPD XMM6, [RCX+32] ;
  1211. MOVUPD XMM7, [RCX+48] ;
  1212. ADD RCX, 64 ;
  1213. ADDPD XMM0, XMM4 ;
  1214. ADDPD XMM1, XMM5 ;
  1215. ADDPD XMM2, XMM6 ;
  1216. ADDPD XMM3, XMM7 ;
  1217. MOVUPD [RDX], XMM0 ;
  1218. MOVUPD [RDX+16], XMM1 ;
  1219. MOVUPD [RDX+32], XMM2 ;
  1220. MOVUPD [RDX+48], XMM3 ;
  1221. ADD RDX, 64 ;
  1222. SUB RAX, 8 ;
  1223. JMP unaligned8 ;
  1224. ; LOOP FOR 2 pieces aligned
  1225. unaligned2: ;
  1226. CMP RAX, 2 ;
  1227. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1228. MOVUPD XMM0, [RBX] ;
  1229. ADD RBX, 16 ;
  1230. MOVUPD XMM1, [RCX] ;
  1231. ADD RCX, 16 ;
  1232. ADDPD XMM0, XMM1 ;
  1233. MOVUPD [RDX], XMM0 ;
  1234. ADD RDX, 16 ;
  1235. SUB RAX, 2 ;
  1236. JMP unaligned2 ;
  1237. ; one piece left OR non-contiguous data
  1238. single:
  1239. singlepieces: ;
  1240. CMP RAX, 0 ;
  1241. JLE endL ; len <= 0- > EXIT
  1242. MOVSD XMM0, [RBX]
  1243. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1244. MOVSD XMM1, [RCX]
1245. ADD RCX, [RBP+rinc] ; INC(radr, rinc)
  1246. ADDSD XMM0, XMM1 ;
  1247. MOVSD [RDX], XMM0
1248. ADD RDX, [RBP+dinc] ; INC(dadr, dinc)
  1249. DEC RAX ; DEC(len)
  1250. JMP singlepieces ;
  1251. endL:
  1252. END AddAXAXLoopSSE;
  1253. PROCEDURE AddARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1254. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1255. MOV RAX, [RBP+len] ;
  1256. CMP RAX, 0 ;
  1257. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1258. MOV RBX, [RBP+ladr] ;
  1259. MOV RCX, [RBP+radr] ;
  1260. MOV RDX, [RBP+dadr] ;
  1261. ; check IF data are contiguous IN memory
1262. CMP [RBP+linc], 4 ; check left FOR continuity
1263. JNE single ; not contiguous -> simplest method
1264. CMP [RBP+rinc], 4 ; check right FOR continuity
1265. JNE single ; not contiguous -> simplest method
1266. CMP [RBP+dinc], 4 ; check destination FOR continuity
1267. JNE single ; not contiguous -> simplest method
  1268. ; check FOR alignment
  1269. MOV RSI, RBX ;
  1270. AND RSI, 3 ; ladr MOD 4
  1271. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1272. JNE unaligned ; not 32 bit aligned
  1273. MOV RSI, RCX ;
  1274. AND RSI, 3 ; radr MOD 4
  1275. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1276. JNE unaligned ; not 32 bit aligned
  1277. MOV RSI, RDX ;
  1278. AND RSI, 3 ; dadr MOD 4
  1279. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1280. JNE unaligned ; not 32 bit aligned
  1281. MOV RSI, RBX ;
  1282. AND RSI, 8+4 ; 16 byte alignment?
  1283. MOV RDI, RCX ;
  1284. AND RDI, 8+4 ; 16 byte alignment?
  1285. CMP RSI, RDI ;
  1286. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1287. MOV RDI, RDX ;
  1288. AND RDI, 8+4 ; 16 byte alignment
  1289. CMP RSI, RDI ;
  1290. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1291. CMP RSI, 0 ;
  1292. JE aligned ; already aligned
  1293. align:
1294. ; process single elements UNTIL 128 bit alignment IS achieved
  1295. MOVSS XMM1, [RBX] ;
  1296. MOVSS XMM0, [RCX] ;
  1297. ADDSS XMM0, XMM1 ;
  1298. MOVSS [RDX], XMM0 ;
  1299. ADD RBX, 4 ;
  1300. ADD RCX, 4 ;
  1301. ADD RDX, 4 ;
  1302. DEC RAX ; one element has been processed ;
  1303. CMP RAX, 0 ; all elements already processed?
  1304. JLE single ;
  1305. MOV RSI, RBX ;
  1306. AND RSI, 8+4 ;
  1307. CMP RSI, 0 ;
  1308. JNE align ;
  1309. aligned:
  1310. aligned16:
  1311. CMP RAX, 16 ;
1312. JL aligned4 ; len < 16 -> EXIT TO aligned4
  1313. MOVAPS XMM0, [RBX] ;
  1314. MOVAPS XMM1, [RBX+16] ;
  1315. MOVAPS XMM2, [RBX+32] ;
  1316. MOVAPS XMM3, [RBX+48] ;
  1317. ADD RBX, 64 ;
  1318. MOVAPS XMM4, [RCX] ;
  1319. MOVAPS XMM5, [RCX+16] ;
  1320. MOVAPS XMM6, [RCX+32] ;
  1321. MOVAPS XMM7, [RCX+48] ;
  1322. ADD RCX, 64 ;
  1323. ADDPS XMM0, XMM4 ;
  1324. ADDPS XMM1, XMM5 ;
  1325. ADDPS XMM2, XMM6 ;
  1326. ADDPS XMM3, XMM7 ;
  1327. MOVAPS [RDX], XMM0 ;
  1328. MOVAPS [RDX+16], XMM1 ;
  1329. MOVAPS [RDX+32], XMM2 ;
  1330. MOVAPS [RDX+48], XMM3 ;
  1331. ADD RDX, 64 ;
  1332. SUB RAX, 16 ;
  1333. JMP aligned16 ;
  1334. ; LOOP FOR 2 pieces aligned
  1335. aligned4: ;
  1336. CMP RAX, 4 ;
1337. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1338. MOVAPS XMM0, [RBX] ;
  1339. ADD RBX, 16 ;
  1340. MOVAPS XMM1, [RCX] ;
  1341. ADD RCX, 16 ;
  1342. ADDPS XMM0, XMM1 ;
  1343. MOVAPS [RDX], XMM0 ;
  1344. ADD RDX, 16 ;
  1345. SUB RAX, 4 ;
  1346. JMP aligned4 ;
  1347. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1348. unaligned: ;
  1349. unaligned16: ;
  1350. CMP RAX, 16 ;
1351. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  1352. MOVUPS XMM0, [RBX] ;
  1353. MOVUPS XMM1, [RBX+16] ;
  1354. MOVUPS XMM2, [RBX+32] ;
  1355. MOVUPS XMM3, [RBX+48] ;
  1356. ADD RBX, 64 ;
  1357. MOVUPS XMM4, [RCX] ;
  1358. MOVUPS XMM5, [RCX+16] ;
  1359. MOVUPS XMM6, [RCX+32] ;
  1360. MOVUPS XMM7, [RCX+48] ;
  1361. ADD RCX, 64 ;
  1362. ADDPS XMM0, XMM4 ;
  1363. ADDPS XMM1, XMM5 ;
  1364. ADDPS XMM2, XMM6 ;
  1365. ADDPS XMM3, XMM7 ;
  1366. MOVUPS [RDX], XMM0 ;
  1367. MOVUPS [RDX+16], XMM1 ;
  1368. MOVUPS [RDX+32], XMM2 ;
  1369. MOVUPS [RDX+48], XMM3 ;
  1370. ADD RDX, 64 ;
  1371. SUB RAX, 16 ;
  1372. JMP unaligned16 ;
  1373. ; LOOP FOR 2 pieces aligned
  1374. unaligned4: ;
  1375. CMP RAX, 4 ;
1376. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1377. MOVUPS XMM0, [RBX] ;
  1378. ADD RBX, 16 ;
  1379. MOVUPS XMM1, [RCX] ;
  1380. ADD RCX, 16 ;
  1381. ADDPS XMM0, XMM1 ;
  1382. MOVUPS [RDX], XMM0 ;
  1383. ADD RDX, 16 ;
  1384. SUB RAX, 4 ;
  1385. JMP unaligned4 ;
  1386. ; one piece left OR non-contiguous data
  1387. single:
  1388. singlepieces: ;
  1389. CMP RAX, 0 ;
  1390. JLE endL ; len <= 0- > EXIT
  1391. MOVSS XMM0, [RBX]
  1392. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1393. MOVSS XMM1, [RCX]
1394. ADD RCX, [RBP+rinc] ; INC(radr, rinc)
  1395. ADDSS XMM0, XMM1 ;
  1396. MOVSS [RDX], XMM0
1397. ADD RDX, [RBP+dinc] ; INC(dadr, dinc)
  1398. DEC RAX ; DEC(len)
  1399. JMP singlepieces ;
  1400. endL:
  1401. END AddARARLoopSSE;
  1402. PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1403. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1404. MOV RAX, [RBP+len] ; eax := len
  1405. MOV RBX, [RBP+ladr] ; ebx := ladr
  1406. MOV RCX, [RBP+radr] ; ecx := radr
  1407. MOV RDX, [RBP+dadr] ; edx := dadr
  1408. FLD QWORD [RDX] ; S.GET(dadr, x)
  1409. start:
  1410. CMP RAX, 0 ; WHILE len > 0 DO
  1411. JLE endL
  1412. FLD QWORD [RBX] ; S.GET(ladr, x)
  1413. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1414. FLD QWORD [RCX] ; S.GET(ladr, y)
  1415. FMULP ; x := x*y
  1416. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1417. FADDP ; z := z+x
  1418. DEC RAX ; DEC(len)
  1419. JMP start ;
  1420. endL:
  1421. FSTP QWORD [RDX] ; S.PUT(dadr, x)
  1422. FWAIT ;
  1423. END SPAXAXLoopA;
  1424. PROCEDURE SPARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1425. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1426. MOV RAX, [RBP+len] ; eax := len
  1427. MOV RBX, [RBP+ladr] ; ebx := ladr
  1428. MOV RCX, [RBP+radr] ; ecx := radr
  1429. MOV RDX, [RBP+dadr] ; edx := dadr
  1430. FLD DWORD [RDX] ; S.GET(dadr, x)
  1431. start:
  1432. CMP RAX, 0 ; WHILE len > 0 DO
  1433. JLE endL
  1434. FLD DWORD [RBX] ; S.GET(ladr, x)
  1435. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1436. FLD DWORD [RCX] ; S.GET(ladr, y)
  1437. FMULP ; x := x*y
  1438. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1439. FADDP ; z := z+x
  1440. DEC RAX ; DEC(len)
  1441. JMP start ;
  1442. endL:
  1443. FSTP DWORD [RDX] ; S.PUT(dadr, x)
  1444. FWAIT ;
  1445. END SPARARLoopA;
  1446. (* sse version of scalar product *)
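(* For orientation: a minimal scalar sketch of what the SSE scalar product loops below compute. It mirrors the S.GET/S.PUT pseudo code in the comments of the FPU variants above; SPRef is a hypothetical name and the sketch is not compiled as part of this module.
PROCEDURE SPRef( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
VAR x, y, z: LONGREAL;
BEGIN
SYSTEM.GET( dadr, z ); (* the running sum starts from the current destination value *)
WHILE len > 0 DO
SYSTEM.GET( ladr, x ); SYSTEM.GET( radr, y );
z := z + x*y;
INC( ladr, linc ); INC( radr, rinc ); DEC( len )
END;
SYSTEM.PUT( dadr, z )
END SPRef;
*)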
  1447. PROCEDURE SPAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1448. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1449. ; register initialization
1450. MOV RAX, [RBP+len] ; RAX reserved FOR length
  1451. CMP RAX, 0 ;
  1452. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1453. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  1454. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
  1455. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  1456. XORPD XMM0, XMM0 ;
  1457. MOVSD XMM0, [RDX] ; destination- > low bytes OF xmm0
1458. CMP [RBP+linc], 8 ; check left FOR continuity
1459. JNE single ; not contiguous -> simplest method
1460. CMP [RBP+rinc], 8 ; check right FOR continuity
1461. JNE single ; not contiguous -> simplest method
  1462. ; check FOR alignment
  1463. MOV RSI, RBX ;
  1464. AND RSI, 7 ; ladr MOD 8
  1465. CMP RSI, 0 ; RCX = 0- > 64 Bit alignment
  1466. JNE unaligned ; not 64 bit aligned
  1467. MOV RSI, RCX ;
  1468. AND RSI, 7 ; radr MOD 8
  1469. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1470. JNE unaligned ; not 64 bit aligned
  1471. MOV RSI, RBX ;
  1472. AND RSI, 8 ; 16 byte alignment
  1473. MOV RDI, RCX ;
  1474. AND RDI, 8 ; 16 byte alignment
  1475. CMP RSI, RDI ;
  1476. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1477. CMP RSI, 8 ;
  1478. JNE aligned ; ladr and dadr already 128 bit aligned
1479. ; process one single element TO achieve 128 bit alignment
  1480. MOVSD XMM1, [RBX] ;
  1481. MOVSD XMM2, [RCX] ;
  1482. MULSD XMM1, XMM2 ;
  1483. ADDSD XMM0, XMM1 ;
  1484. ADD RBX, 8 ; now RBX IS 16 byte aligned
1485. ADD RCX, 8 ; now RCX IS 16 byte aligned ;
  1486. DEC RAX ; one element has been processed
1487. ; LOOP FOR 6 pieces aligned
  1488. aligned:
  1489. aligned6:
  1490. CMP RAX, 6 ;
1491. JL aligned2 ; len < 6 -> EXIT TO aligned2
  1492. MOVAPD XMM1, [RBX] ;
  1493. MOVAPD XMM2, [RBX+16] ;
  1494. MOVAPD XMM3, [RBX+32] ;
  1495. MOVAPD XMM4, [RCX] ;
  1496. MOVAPD XMM5, [RCX+16] ;
  1497. MOVAPD XMM6, [RCX+32] ;
  1498. MULPD XMM1, XMM4 ;
  1499. ADDPD XMM0, XMM1 ;
  1500. MULPD XMM2, XMM5 ;
  1501. ADDPD XMM0, XMM2 ;
  1502. MULPD XMM3, XMM6 ;
  1503. ADDPD XMM0, XMM3 ;
  1504. ADD RBX, 48 ;
  1505. ADD RCX, 48 ;
  1506. SUB RAX, 6 ;
  1507. JMP aligned6 ;
  1508. ; LOOP FOR 2 pieces aligned
  1509. aligned2:
  1510. CMP RAX, 2 ;
1511. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  1512. MOVAPD XMM1, [RBX] ;
  1513. MOVAPD XMM2, [RCX] ;
  1514. MULPD XMM1, XMM2 ;
  1515. ADDPD XMM0, XMM1 ;
  1516. ADD RBX, 16 ;
  1517. ADD RCX, 16 ;
  1518. SUB RAX, 2 ;
  1519. JMP aligned2 ;
  1520. unaligned:
  1521. unaligned6:
  1522. CMP RAX, 6 ;
1523. JL unaligned2 ; len < 6 -> EXIT TO unaligned2
  1524. MOVUPD XMM1, [RBX] ;
  1525. MOVUPD XMM2, [RBX+16] ;
  1526. MOVUPD XMM3, [RBX+32] ;
  1527. MOVUPD XMM4, [RCX] ;
  1528. MOVUPD XMM5, [RCX+16] ;
  1529. MOVUPD XMM6, [RCX+32] ;
  1530. MULPD XMM1, XMM4 ;
  1531. ADDPD XMM0, XMM1 ;
  1532. MULPD XMM2, XMM5 ;
  1533. ADDPD XMM0, XMM2 ;
  1534. MULPD XMM3, XMM6 ;
  1535. ADDPD XMM0, XMM3 ;
  1536. ADD RBX, 48 ;
  1537. ADD RCX, 48 ;
  1538. SUB RAX, 6 ;
  1539. JMP unaligned6 ;
  1540. ; LOOP FOR 2 pieces aligned
  1541. unaligned2:
  1542. CMP RAX, 2 ;
1543. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  1544. MOVUPD XMM1, [RBX] ;
  1545. MOVUPD XMM2, [RCX] ;
  1546. MULPD XMM1, XMM2 ;
  1547. ADDPD XMM0, XMM1 ;
  1548. ADD RBX, 16 ;
  1549. ADD RCX, 16 ;
  1550. SUB RAX, 2 ;
  1551. JMP unaligned2 ;
  1552. horizontaladd: ;
  1553. MOVAPD XMM1, XMM0 ;
  1554. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  1555. ADDPD XMM0, XMM1 ;
  1556. JMP singlepieces ;
  1557. single:
  1558. singlepieces: ;
  1559. CMP RAX, 0 ;
  1560. JLE store ; len <= 0- > EXIT
  1561. MOVSD XMM1, [RBX]
  1562. MOVSD XMM2, [RCX]
  1563. MULSD XMM1, XMM2
  1564. ADDSD XMM0, XMM1
  1565. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1566. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1567. DEC RAX ; DEC(len)
  1568. JMP singlepieces ;
  1569. store:
  1570. MOVSD [RDX], XMM0 ;
  1571. endL:
  1572. END SPAXAXLoopSSE;
  1573. (* sse version of scalar product *)
  1574. PROCEDURE SPARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1575. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  1576. ; register initialization
1577. MOV RAX, [RBP+len] ; RAX reserved FOR length
  1578. CMP RAX, 0 ;
  1579. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1580. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  1581. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
  1582. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  1583. XORPS XMM0, XMM0 ;
  1584. MOVSS XMM0, [RDX] ; destination- > low bytes OF xmm0
1585. CMP [RBP+linc], 4 ; check left FOR continuity
1586. JNE single ; not contiguous -> simplest method
1587. CMP [RBP+rinc], 4 ; check right FOR continuity
1588. JNE single ; not contiguous -> simplest method
  1589. ; check FOR alignment
  1590. MOV RSI, RBX ;
  1591. AND RSI, 3 ; ladr MOD 4
  1592. CMP RSI, 0 ; RCX = 0- > 32 Bit alignment
  1593. JNE unaligned ; not 32 bit aligned
  1594. MOV RSI, RCX ;
  1595. AND RSI, 3 ; radr MOD 4
  1596. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1597. JNE unaligned ; not 32 bit aligned
  1598. MOV RSI, RBX ;
  1599. AND RSI, 8+4 ; 16 byte alignment
  1600. MOV RDI, RCX ;
  1601. AND RDI, 8+4 ; 16 byte alignment
  1602. CMP RSI, RDI ;
  1603. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1604. CMP RSI, 0 ;
  1605. JE aligned ; already aligned
  1606. align:
1607. ; process single elements UNTIL 128 bit alignment IS achieved
  1608. MOVSS XMM1, [RBX] ;
  1609. MOVSS XMM2, [RCX] ;
  1610. MULSS XMM1, XMM2 ;
  1611. ADDSS XMM0, XMM1 ;
  1612. ADD RBX, 4 ;
  1613. ADD RCX, 4 ;
  1614. DEC RAX ; one element has been processed ;
  1615. CMP RAX, 0 ; all elements already processed?
  1616. JLE single ;
  1617. MOV RSI, RBX ;
  1618. AND RSI, 8+4 ;
  1619. CMP RSI, 0 ;
  1620. JNE align ;
  1621. aligned:
  1622. aligned12:
  1623. CMP RAX, 12 ;
1624. JL aligned4 ; len < 12 -> EXIT TO aligned4
  1625. MOVAPS XMM1, [RBX] ;
  1626. MOVAPS XMM2, [RBX+16] ;
  1627. MOVAPS XMM3, [RBX+32] ;
  1628. MOVAPS XMM4, [RCX] ;
  1629. MOVAPS XMM5, [RCX+16] ;
  1630. MOVAPS XMM6, [RCX+32] ;
  1631. MULPS XMM1, XMM4 ;
  1632. ADDPS XMM0, XMM1 ;
  1633. MULPS XMM2, XMM5 ;
  1634. ADDPS XMM0, XMM2 ;
  1635. MULPS XMM3, XMM6 ;
  1636. ADDPS XMM0, XMM3 ;
  1637. ADD RBX, 48 ;
  1638. ADD RCX, 48 ;
  1639. SUB RAX, 12 ;
  1640. JMP aligned12 ;
  1641. ; LOOP FOR 2 pieces aligned
  1642. aligned4:
  1643. CMP RAX, 4 ;
  1644. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  1645. MOVAPS XMM1, [RBX] ;
  1646. MOVAPS XMM2, [RCX] ;
  1647. MULPS XMM1, XMM2 ;
  1648. ADDPS XMM0, XMM1 ;
  1649. ADD RBX, 16 ;
  1650. ADD RCX, 16 ;
  1651. SUB RAX, 4 ;
  1652. JMP aligned4 ;
  1653. unaligned:
  1654. unaligned12:
  1655. CMP RAX, 12 ;
1656. JL unaligned4 ; len < 12 -> EXIT TO unaligned4
  1657. MOVUPS XMM1, [RBX] ;
  1658. MOVUPS XMM2, [RBX+16] ;
  1659. MOVUPS XMM3, [RBX+32] ;
  1660. MOVUPS XMM4, [RCX] ;
  1661. MOVUPS XMM5, [RCX+16] ;
  1662. MOVUPS XMM6, [RCX+32] ;
  1663. MULPS XMM1, XMM4 ;
  1664. ADDPS XMM0, XMM1 ;
  1665. MULPS XMM2, XMM5 ;
  1666. ADDPS XMM0, XMM2 ;
  1667. MULPS XMM3, XMM6 ;
  1668. ADDPS XMM0, XMM3 ;
  1669. ADD RBX, 48 ;
  1670. ADD RCX, 48 ;
  1671. SUB RAX, 12 ;
  1672. JMP unaligned12 ;
  1673. ; LOOP FOR 2 pieces aligned
  1674. unaligned4:
  1675. CMP RAX, 4 ;
  1676. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  1677. MOVUPS XMM1, [RBX] ;
  1678. MOVUPS XMM2, [RCX] ;
  1679. MULPS XMM1, XMM2 ;
  1680. ADDPS XMM0, XMM1 ;
  1681. ADD RBX, 16 ;
  1682. ADD RCX, 16 ;
  1683. SUB RAX, 4 ;
  1684. JMP unaligned4 ;
  1685. horizontaladd: ;
  1686. MOVAPS XMM1, XMM0 ;
  1687. ; 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *)
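; i.e. the decomposition above evaluates to the immediate 0 + 4 + 0 + 64 = 68 = 044H; each 2-bit field selects the source element for one destination position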
  1688. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  1689. ADDPS XMM1, XMM0 ;
  1690. MOVAPS XMM0, XMM1
  1691. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  1692. ADDPS XMM0, XMM1 ;
  1693. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  1694. JMP singlepieces ;
  1695. single:
  1696. singlepieces: ;
  1697. CMP RAX, 0 ;
  1698. JLE store ; len <= 0- > EXIT
  1699. MOVSS XMM1, [RBX]
  1700. MOVSS XMM2, [RCX]
  1701. MULSS XMM1, XMM2
  1702. ADDSS XMM0, XMM1
  1703. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1704. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1705. DEC RAX ; DEC(len)
  1706. JMP singlepieces ;
  1707. store:
  1708. MOVSS [RDX], XMM0 ;
  1709. endL:
  1710. END SPARARLoopSSE;
  1711. PROCEDURE MulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1712. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1713. MOV RAX, [RBP+len] ; eax := len
  1714. MOV RBX, [RBP+ladr] ; ebx := ladr
  1715. MOV RCX, [RBP+radr] ; ecx := radr
  1716. MOV RDX, [RBP+dadr] ; edx := dadr
  1717. start:
  1718. CMP RAX, 0 ; WHILE len > 0 DO
  1719. JLE endL
  1720. FLD QWORD [RBX] ; S.GET(ladr, x)
  1721. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1722. FLD QWORD [RCX] ; S.GET(ladr, y)
  1723. FMULP ; x := x*y
  1724. FSTP QWORD [RDX]
  1725. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1726. DEC RAX ; DEC(len)
  1727. JMP start ;
  1728. endL:
  1729. FWAIT ;
  1730. END MulAXSXLoopA;
  1731. PROCEDURE MulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1732. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1733. MOV RAX, [RBP+len] ; eax := len
  1734. MOV RBX, [RBP+ladr] ; ebx := ladr
  1735. MOV RCX, [RBP+radr] ; ecx := radr
  1736. MOV RDX, [RBP+dadr] ; edx := dadr
  1737. start:
  1738. CMP RAX, 0 ; WHILE len > 0 DO
  1739. JLE endL
  1740. FLD DWORD [RBX] ; S.GET(ladr, x)
  1741. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1742. FLD DWORD [RCX] ; S.GET(ladr, y)
  1743. FMULP ; x := x*y
  1744. FSTP DWORD [RDX]
  1745. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1746. DEC RAX ; DEC(len)
  1747. JMP start ;
  1748. endL:
  1749. FWAIT ;
  1750. END MulARSRLoopA;
  1751. PROCEDURE IncMulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1752. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1753. MOV RAX, [RBP+len] ; eax := len
  1754. MOV RBX, [RBP+ladr] ; ebx := ladr
  1755. MOV RCX, [RBP+radr] ; ecx := radr
  1756. MOV RDX, [RBP+dadr] ; edx := dadr
  1757. start:
  1758. CMP RAX, 0 ; WHILE len > 0 DO
  1759. JLE endL
  1760. FLD QWORD [RBX] ; S.GET(ladr, x)
  1761. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1762. FLD QWORD [RCX] ; S.GET(ladr, y)
  1763. FMULP ; x := x*y
  1764. FLD QWORD [RDX+8] ;
  1765. FADDP ;
  1766. FSTP QWORD [RDX]
  1767. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1768. DEC RAX ; DEC(len)
  1769. JMP start ;
  1770. endL:
  1771. FWAIT ;
  1772. END IncMulAXSXLoopA;
  1773. PROCEDURE IncMulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1774. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1775. MOV RAX, [RBP+len] ; eax := len
  1776. MOV RBX, [RBP+ladr] ; ebx := ladr
  1777. MOV RCX, [RBP+radr] ; ecx := radr
  1778. MOV RDX, [RBP+dadr] ; edx := dadr
  1779. start:
  1780. CMP RAX, 0 ; WHILE len > 0 DO
  1781. JLE endL
  1782. FLD DWORD [RBX] ; S.GET(ladr, x)
  1783. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1784. FLD DWORD [RCX] ; S.GET(ladr, y)
  1785. FMULP ; x := x*y
  1786. FLD DWORD [RDX+8] ;
  1787. FADDP ;
  1788. FSTP DWORD [RDX]
  1789. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1790. DEC RAX ; DEC(len)
  1791. JMP start ;
  1792. endL:
  1793. FWAIT ;
  1794. END IncMulARSRLoopA;
  1795. PROCEDURE MulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1796. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  1797. (*
  1798. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  1799. 2.) process starting unaligned data ( using single instructions)
  1800. 3.) process aligned data
  1801. 4.) process remaining unaligned data (using single instructions)
  1802. *)
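(* A rough sketch of the dispatch implemented below, assuming contiguous data (linc = dinc = 8); the labels refer to the assembly that follows:
IF (ladr MOD 8 # 0) OR (dadr MOD 8 # 0) OR (ladr MOD 16 # dadr MOD 16) THEN
(* unaligned: MOVUPD loops of 8, then 2, then single pieces *)
ELSIF ladr MOD 16 = 8 THEN
(* multiply one element with MOVSD, after which both pointers are 16 byte aligned, then continue with the aligned MOVAPD loops *)
ELSE
(* aligned: MOVAPD loops of 8, then 2, then single pieces *)
END
*)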
  1803. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1804. ; register initialization
1805. MOV RAX, [RBP+len] ; RAX reserved FOR length
  1806. CMP RAX, 0 ;
  1807. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1808. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  1809. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  1810. MOV RCX, [RBP+radr] ;
  1811. MOVSD XMM0, [RCX] ;
  1812. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  1813. ; check IF data are contiguous IN memory
1814. CMP [RBP+linc], 8 ; check left FOR continuity
1815. JNE single ; not contiguous -> simplest method
1816. CMP [RBP+dinc], 8 ; check dest FOR continuity
1817. JNE single ; not contiguous -> simplest method
  1818. ; check FOR alignment
  1819. MOV RCX, RBX ;
  1820. AND RCX, 7 ; ladr MOD 8
  1821. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  1822. JNE unaligned ; not 64 bit aligned
  1823. MOV RCX, RDX ;
  1824. AND RCX, 7 ; dadr MOD 8
  1825. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  1826. JNE unaligned ; not 64 bit aligned
  1827. MOV RSI, RBX ;
  1828. AND RSI, 8 ; 16 byte alignment
  1829. MOV RDI, RDX ;
  1830. AND RDI, 8 ; 16 byte alignment
  1831. CMP RSI, RDI ;
  1832. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1833. CMP RSI, 8 ;
  1834. JNE aligned ; ladr and dadr already 128 bit aligned
1835. ; process one single element TO achieve 128 bit alignment
  1836. MOVSD XMM1, [RBX] ;
  1837. MULSD XMM1, XMM0 ;
  1838. MOVSD [RDX], XMM1 ;
  1839. ADD RBX, 8 ; now RBX IS 16 byte aligned
  1840. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  1841. DEC RAX ; one element has been processed
  1842. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  1843. aligned:
  1844. aligned8:
  1845. CMP RAX, 8 ;
1846. JL aligned2 ; len < 8 -> EXIT TO aligned2
  1847. MOVAPD XMM1, [RBX] ;
  1848. MOVAPD XMM2, [RBX+16] ;
  1849. MOVAPD XMM3, [RBX+32] ;
  1850. MOVAPD XMM4, [RBX+48] ;
  1851. ADD RBX, 64 ;
  1852. MULPD XMM1, XMM0 ;
  1853. MULPD XMM2, XMM0 ;
  1854. MULPD XMM3, XMM0 ;
  1855. MULPD XMM4, XMM0 ;
  1856. MOVAPD [RDX], XMM1 ;
  1857. MOVAPD [RDX+16], XMM2 ;
  1858. MOVAPD [RDX+32], XMM3 ;
  1859. MOVAPD [RDX+48], XMM4 ;
  1860. ADD RDX, 64 ;
  1861. SUB RAX, 8 ;
  1862. JMP aligned8 ;
  1863. ; LOOP FOR 2 pieces aligned
  1864. aligned2: ;
  1865. CMP RAX, 2 ;
  1866. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1867. MOVAPD XMM1, [RBX] ;
  1868. ADD RBX, 16 ;
  1869. MULPD XMM1, XMM0 ;
  1870. MOVAPD [RDX], XMM1 ;
  1871. ADD RDX, 16 ;
  1872. SUB RAX, 2 ;
  1873. JMP aligned2 ;
  1874. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1875. unaligned: ;
  1876. unaligned8: ;
  1877. CMP RAX, 8 ;
1878. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  1879. MOVUPD XMM1, [RBX] ;
  1880. MOVUPD XMM2, [RBX+16] ;
  1881. MOVUPD XMM3, [RBX+32] ;
  1882. MOVUPD XMM4, [RBX+48] ;
  1883. ADD RBX, 64
  1884. MULPD XMM1, XMM0 ;
  1885. MULPD XMM2, XMM0 ;
  1886. MULPD XMM3, XMM0 ;
  1887. MULPD XMM4, XMM0 ;
  1888. MOVUPD [RDX], XMM1 ;
  1889. MOVUPD [RDX+16], XMM2 ;
  1890. MOVUPD [RDX+32], XMM3 ;
  1891. MOVUPD [RDX+48], XMM4 ;
  1892. ADD RDX, 64 ;
  1893. SUB RAX, 8 ;
  1894. JMP unaligned8 ;
  1895. ; LOOP FOR 2 pieces unaligned
  1896. unaligned2: ;
  1897. CMP RAX, 2 ;
  1898. JL singlepieces ; len < 2- > EXIT
  1899. MOVUPD XMM1, [RBX] ;
  1900. ADD RBX, 16 ;
  1901. MULPD XMM1, XMM0 ;
  1902. MOVUPD [RDX], XMM1 ;
  1903. ADD RDX, 16 ;
  1904. SUB RAX, 2 ;
  1905. JMP unaligned2 ;
  1906. ; one piece left OR non-contiguous data
  1907. single:
  1908. singlepieces: ;
  1909. CMP RAX, 0 ;
  1910. JLE endL ; len <= 0- > EXIT
  1911. MOVSD XMM1, [RBX]
  1912. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1913. MULSD XMM1, XMM0
  1914. MOVSD [RDX], XMM1
  1915. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  1916. DEC RAX ; DEC(len)
  1917. JMP singlepieces ;
  1918. endL:
  1919. END MulAXSXLoopSSE;
  1920. PROCEDURE MulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1921. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  1922. (*
  1923. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  1924. 2.) process starting unaligned data ( using single instructions)
  1925. 3.) process aligned data
  1926. 4.) process remaining unaligned data (using single instructions)
  1927. *)
  1928. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  1929. ; register initialization
1930. MOV RAX, [RBP+len] ; RAX reserved FOR length
  1931. CMP RAX, 0 ;
  1932. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1933. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  1934. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  1935. MOV RCX, [RBP+radr] ;
  1936. MOVSS XMM0, [RCX] ;
1937. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  1938. ; check IF data are contiguous IN memory
1939. CMP [RBP+linc], 4 ; check left FOR continuity
1940. JNE single ; not contiguous -> simplest method
1941. CMP [RBP+dinc], 4 ; check dest FOR continuity
1942. JNE single ; not contiguous -> simplest method
  1943. ; check FOR alignment
  1944. MOV RCX, RBX ;
  1945. AND RCX, 3 ; ladr MOD 4
  1946. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
  1947. JNE unaligned ; not 32 bit aligned
  1948. MOV RCX, RDX ;
  1949. AND RCX, 3 ; dadr MOD 4
  1950. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
1951. JNE unaligned ; not 32 bit aligned
  1952. MOV RSI, RBX ;
  1953. AND RSI, 8+4 ; 16 byte alignment
  1954. MOV RDI, RDX ;
  1955. AND RDI, 8+4 ; 16 byte alignment
  1956. CMP RSI, RDI ;
  1957. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1958. CMP RSI, 0 ;
  1959. JE aligned ; already aligned
  1960. align:
1961. ; process single elements UNTIL 128 bit alignment IS achieved
  1962. MOVSS XMM1, [RBX] ;
  1963. MULSS XMM1, XMM0 ;
  1964. MOVSS [RDX], XMM1 ;
  1965. ADD RBX, 4 ;
  1966. ADD RDX, 4 ;
  1967. DEC RAX ; one element has been processed ;
  1968. CMP RAX, 0 ; all elements already processed?
  1969. JLE single
  1970. MOV RSI, RBX ;
  1971. AND RSI, 8+4 ;
  1972. CMP RSI, 0 ;
  1973. JNE align ;
  1974. aligned:
  1975. aligned16:
  1976. CMP RAX, 16 ;
1977. JL aligned4 ; len < 16 -> EXIT TO aligned4
  1978. MOVAPS XMM1, [RBX] ;
  1979. MOVAPS XMM2, [RBX+16] ;
  1980. MOVAPS XMM3, [RBX+32] ;
  1981. MOVAPS XMM4, [RBX+48] ;
  1982. ADD RBX, 64 ;
  1983. MULPS XMM1, XMM0 ;
  1984. MULPS XMM2, XMM0 ;
  1985. MULPS XMM3, XMM0 ;
  1986. MULPS XMM4, XMM0 ;
  1987. MOVAPS [RDX], XMM1 ;
  1988. MOVAPS [RDX+16], XMM2 ;
  1989. MOVAPS [RDX+32], XMM3 ;
  1990. MOVAPS [RDX+48], XMM4 ;
  1991. ADD RDX, 64 ;
  1992. SUB RAX, 16 ;
  1993. JMP aligned16 ;
  1994. ; LOOP FOR 2 pieces aligned
  1995. aligned4: ;
  1996. CMP RAX, 4 ;
1997. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1998. MOVAPS XMM1, [RBX] ;
  1999. ADD RBX, 16 ;
  2000. MULPS XMM1, XMM0 ;
  2001. MOVAPS [RDX], XMM1 ;
  2002. ADD RDX, 16 ;
  2003. SUB RAX, 4 ;
  2004. JMP aligned4 ;
  2005. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2006. unaligned: ;
  2007. unaligned16: ;
  2008. CMP RAX, 16 ;
2009. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  2010. MOVUPS XMM1, [RBX] ;
  2011. MOVUPS XMM2, [RBX+16] ;
  2012. MOVUPS XMM3, [RBX+32] ;
  2013. MOVUPS XMM4, [RBX+48] ;
  2014. ADD RBX, 64
  2015. MULPS XMM1, XMM0 ;
  2016. MULPS XMM2, XMM0 ;
  2017. MULPS XMM3, XMM0 ;
  2018. MULPS XMM4, XMM0 ;
  2019. MOVUPS [RDX], XMM1 ;
  2020. MOVUPS [RDX+16], XMM2 ;
  2021. MOVUPS [RDX+32], XMM3 ;
  2022. MOVUPS [RDX+48], XMM4 ;
  2023. ADD RDX, 64 ;
  2024. SUB RAX, 16 ;
  2025. JMP unaligned16 ;
  2026. ; LOOP FOR 2 pieces unaligned
  2027. unaligned4: ;
  2028. CMP RAX, 4 ;
2029. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2030. MOVUPS XMM1, [RBX] ;
  2031. ADD RBX, 16 ;
  2032. MULPS XMM1, XMM0 ;
  2033. MOVUPS [RDX], XMM1 ;
  2034. ADD RDX, 16 ;
  2035. SUB RAX, 4 ;
  2036. JMP unaligned4 ;
  2037. ; one piece left OR non-contiguous data
  2038. single:
  2039. singlepieces: ;
  2040. CMP RAX, 0 ;
  2041. JLE endL ; len <= 0- > EXIT
  2042. MOVSS XMM1, [RBX]
  2043. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2044. MULSS XMM1, XMM0
  2045. MOVSS [RDX], XMM1
  2046. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2047. DEC RAX ; DEC(len)
  2048. JMP singlepieces ;
  2049. endL:
  2050. END MulARSRLoopSSE;
  2051. PROCEDURE IncMulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2052. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2053. (*
  2054. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2055. 2.) process starting unaligned data ( using single instructions)
  2056. 3.) process aligned data
  2057. 4.) process remaining unaligned data (using single instructions)
  2058. *)
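(* Same dispatch as in MulAXSXLoopSSE above; only the update rule differs. Per element (scalar sketch, names as in the FPU comments): d := d + l*s, i.e. the previous destination value is loaded, the product is added and the sum is stored back. *)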
  2059. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2060. ; register initialization
2061. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2062. CMP RAX, 0 ;
  2063. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  2064. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2065. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2066. MOV RCX, [RBP+radr] ;
  2067. MOVSD XMM0, [RCX] ;
  2068. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2069. ; check IF data are contiguous IN memory
2070. CMP [RBP+linc], 8 ; check left FOR continuity
2071. JNE single ; not contiguous -> simplest method
2072. CMP [RBP+dinc], 8 ; check dest FOR continuity
2073. JNE single ; not contiguous -> simplest method
  2074. ; check FOR alignment
  2075. MOV RCX, RBX ;
  2076. AND RCX, 7 ; ladr MOD 8
  2077. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2078. JNE unaligned ; not 64 bit aligned
  2079. MOV RCX, RDX ;
  2080. AND RCX, 7 ; dadr MOD 8
  2081. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2082. JNE unaligned ; not 64 bit aligned
  2083. MOV RSI, RBX ;
  2084. AND RSI, 8 ; 16 byte alignment
  2085. MOV RDI, RDX ;
  2086. AND RDI, 8 ; 16 byte alignment
  2087. CMP RSI, RDI ;
  2088. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2089. CMP RSI, 8 ;
  2090. JNE aligned ; ladr and dadr already 128 bit aligned
2091. ; process one single element TO achieve 128 bit alignment
  2092. MOVSD XMM1, [RBX] ;
  2093. MULSD XMM1, XMM0 ;
  2094. MOVSD XMM2, [RDX] ;
  2095. ADDSD XMM1, XMM2 ;
  2096. MOVSD [RDX], XMM1 ;
  2097. ADD RBX, 8 ; now RBX IS 16 byte aligned
  2098. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  2099. DEC RAX ; one element has been processed
  2100. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2101. aligned:
  2102. aligned8:
  2103. CMP RAX, 8 ;
2104. JL aligned2 ; len < 8 -> EXIT TO aligned2
  2105. MOVAPD XMM1, [RBX] ;
  2106. MOVAPD XMM2, [RBX+16] ;
  2107. MOVAPD XMM3, [RBX+32] ;
  2108. MOVAPD XMM4, [RBX+48] ;
  2109. ADD RBX, 64 ;
  2110. MULPD XMM1, XMM0 ;
  2111. MULPD XMM2, XMM0 ;
  2112. MULPD XMM3, XMM0 ;
  2113. MULPD XMM4, XMM0 ;
  2114. MOVAPD XMM5, [RDX] ;
  2115. ADDPD XMM1, XMM5
  2116. MOVAPD [RDX], XMM1 ;
  2117. MOVAPD XMM6, [RDX+16] ;
  2118. ADDPD XMM2, XMM6
  2119. MOVAPD [RDX+16], XMM2 ;
  2120. MOVAPD XMM7, [RDX+32] ;
  2121. ADDPD XMM3, XMM7
  2122. MOVAPD [RDX+32], XMM3 ;
  2123. MOVAPD XMM5, [RDX+48] ;
  2124. ADDPD XMM4, XMM5
  2125. MOVAPD [RDX+48], XMM4 ;
  2126. ADD RDX, 64 ;
  2127. SUB RAX, 8 ;
  2128. JMP aligned8 ;
  2129. ; LOOP FOR 2 pieces aligned
  2130. aligned2: ;
  2131. CMP RAX, 2 ;
  2132. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2133. MOVAPD XMM1, [RBX] ;
  2134. ADD RBX, 16 ;
  2135. MULPD XMM1, XMM0 ;
  2136. MOVAPD XMM2, [RDX] ;
  2137. ADDPD XMM1, XMM2
  2138. MOVAPD [RDX], XMM1 ;
  2139. ADD RDX, 16 ;
  2140. SUB RAX, 2 ;
  2141. JMP aligned2 ;
  2142. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2143. unaligned: ;
  2144. unaligned8: ;
  2145. CMP RAX, 8 ;
2146. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  2147. MOVUPD XMM1, [RBX] ;
  2148. MOVUPD XMM2, [RBX+16] ;
  2149. MOVUPD XMM3, [RBX+32] ;
  2150. MOVUPD XMM4, [RBX+48] ;
  2151. ADD RBX, 64
  2152. MULPD XMM1, XMM0 ;
  2153. MULPD XMM2, XMM0 ;
  2154. MULPD XMM3, XMM0 ;
  2155. MULPD XMM4, XMM0 ;
  2156. MOVUPD XMM5, [RDX] ;
  2157. ADDPD XMM1, XMM5
  2158. MOVUPD [RDX], XMM1 ;
  2159. MOVUPD XMM6, [RDX+16] ;
  2160. ADDPD XMM2, XMM6
  2161. MOVUPD [RDX+16], XMM2 ;
  2162. MOVUPD XMM7, [RDX+32] ;
  2163. ADDPD XMM3, XMM7
  2164. MOVUPD [RDX+32], XMM3 ;
  2165. MOVUPD XMM5, [RDX+48] ;
  2166. ADDPD XMM4, XMM5
  2167. MOVUPD [RDX+48], XMM4 ;
  2168. ADD RDX, 64 ;
  2169. SUB RAX, 8 ;
  2170. JMP unaligned8 ;
  2171. ; LOOP FOR 2 pieces unaligned
  2172. unaligned2: ;
  2173. CMP RAX, 2 ;
  2174. JL singlepieces ; len < 2- > EXIT
  2175. MOVUPD XMM1, [RBX] ;
  2176. ADD RBX, 16 ;
  2177. MULPD XMM1, XMM0 ;
  2178. MOVUPD XMM2, [RDX] ;
  2179. ADDPD XMM1, XMM2
  2180. MOVUPD [RDX], XMM1 ;
  2181. ADD RDX, 16 ;
  2182. SUB RAX, 2 ;
  2183. JMP unaligned2 ;
  2184. ; one piece left OR non-contiguous data
  2185. single:
  2186. singlepieces: ;
  2187. CMP RAX, 0 ;
  2188. JLE endL ; len <= 0- > EXIT
  2189. MOVSD XMM1, [RBX]
  2190. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2191. MULSD XMM1, XMM0
  2192. MOVSD XMM2, [RDX] ;
  2193. ADDSD XMM1, XMM2
  2194. MOVSD [RDX], XMM1
  2195. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2196. DEC RAX ; DEC(len)
  2197. JMP singlepieces ;
  2198. endL:
  2199. END IncMulAXSXLoopSSE;
  2200. PROCEDURE IncMulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2201. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2202. (*
  2203. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2204. 2.) process starting unaligned data ( using single instructions)
  2205. 3.) process aligned data
  2206. 4.) process remaining unaligned data (using single instructions)
  2207. *)
  2208. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2209. ; register initialization
2210. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2211. CMP RAX, 0 ;
  2212. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  2213. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2214. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2215. MOV RCX, [RBP+radr] ;
  2216. MOVSS XMM0, [RCX] ;
2217. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  2218. ; check IF data are contiguous IN memory
2219. CMP [RBP+linc], 4 ; check left FOR continuity
2220. JNE single ; not contiguous -> simplest method
2221. CMP [RBP+dinc], 4 ; check dest FOR continuity
2222. JNE single ; not contiguous -> simplest method
  2223. ; check FOR alignment
  2224. MOV RCX, RBX ;
  2225. AND RCX, 3 ; ladr MOD 4
  2226. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
  2227. JNE unaligned ; not 32 bit aligned
  2228. MOV RCX, RDX ;
  2229. AND RCX, 3 ; dadr MOD 4
  2230. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
2231. JNE unaligned ; not 32 bit aligned
  2232. MOV RSI, RBX ;
  2233. AND RSI, 8+4 ; 16 byte alignment
  2234. MOV RDI, RDX ;
  2235. AND RDI, 8+4 ; 16 byte alignment
  2236. CMP RSI, RDI ;
  2237. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2238. CMP RSI, 0 ;
  2239. JE aligned ; already aligned
  2240. align:
2241. ; process single elements UNTIL 128 bit alignment IS achieved
  2242. MOVSS XMM1, [RBX] ;
  2243. MULSS XMM1, XMM0 ;
  2244. MOVSS XMM2, [RDX] ;
  2245. ADDSS XMM1, XMM2 ;
  2246. MOVSS [RDX], XMM1 ;
  2247. ADD RBX, 4 ;
  2248. ADD RDX, 4 ;
  2249. DEC RAX ; one element has been processed ;
  2250. CMP RAX, 0 ; all elements already processed?
  2251. JLE single
  2252. MOV RSI, RBX ;
  2253. AND RSI, 8+4 ;
  2254. CMP RSI, 0 ;
  2255. JNE align ;
  2256. aligned:
  2257. aligned16:
  2258. CMP RAX, 16 ;
2259. JL aligned4 ; len < 16 -> EXIT TO aligned4
  2260. MOVAPS XMM1, [RBX] ;
  2261. MOVAPS XMM2, [RBX+16] ;
  2262. MOVAPS XMM3, [RBX+32] ;
  2263. MOVAPS XMM4, [RBX+48] ;
  2264. ADD RBX, 64 ;
  2265. MULPS XMM1, XMM0 ;
  2266. MULPS XMM2, XMM0 ;
  2267. MULPS XMM3, XMM0 ;
  2268. MULPS XMM4, XMM0 ;
  2269. MOVAPS XMM5, [RDX] ;
  2270. ADDPS XMM1, XMM5 ;
  2271. MOVAPS [RDX], XMM1 ;
  2272. MOVAPS XMM6, [RDX+16] ;
  2273. ADDPS XMM2, XMM6 ;
  2274. MOVAPS [RDX+16], XMM2 ;
  2275. MOVAPS XMM7, [RDX+32] ;
  2276. ADDPS XMM3, XMM7 ;
  2277. MOVAPS [RDX+32], XMM3 ;
  2278. MOVAPS XMM5, [RDX+48] ;
  2279. ADDPS XMM4, XMM5 ;
  2280. MOVAPS [RDX+48], XMM4 ;
  2281. ADD RDX, 64 ;
  2282. SUB RAX, 16 ;
  2283. JMP aligned16 ;
  2284. ; LOOP FOR 2 pieces aligned
  2285. aligned4: ;
  2286. CMP RAX, 4 ;
2287. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2288. MOVAPS XMM1, [RBX] ;
  2289. ADD RBX, 16 ;
  2290. MULPS XMM1, XMM0 ;
  2291. MOVAPS XMM2, [RDX] ;
  2292. ADDPS XMM1, XMM2 ;
  2293. MOVAPS [RDX], XMM1 ;
  2294. ADD RDX, 16 ;
  2295. SUB RAX, 4 ;
  2296. JMP aligned4 ;
  2297. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2298. unaligned: ;
  2299. unaligned16: ;
  2300. CMP RAX, 16 ;
2301. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  2302. MOVUPS XMM1, [RBX] ;
  2303. MOVUPS XMM2, [RBX+16] ;
  2304. MOVUPS XMM3, [RBX+32] ;
  2305. MOVUPS XMM4, [RBX+48] ;
  2306. ADD RBX, 64
  2307. MULPS XMM1, XMM0 ;
  2308. MULPS XMM2, XMM0 ;
  2309. MULPS XMM3, XMM0 ;
  2310. MULPS XMM4, XMM0 ;
  2311. MOVUPS XMM5, [RDX] ;
  2312. ADDPS XMM1, XMM5 ;
  2313. MOVUPS [RDX], XMM1 ;
  2314. MOVUPS XMM6, [RDX+16] ;
  2315. ADDPS XMM2, XMM6 ;
  2316. MOVUPS [RDX+16], XMM2 ;
  2317. MOVUPS XMM7, [RDX+32] ;
  2318. ADDPS XMM3, XMM7 ;
  2319. MOVUPS [RDX+32], XMM3 ;
  2320. MOVUPS XMM5, [RDX+48] ;
  2321. ADDPS XMM4, XMM5 ;
  2322. MOVUPS [RDX+48], XMM4 ;
  2323. ADD RDX, 64 ;
  2324. SUB RAX, 16 ;
  2325. JMP unaligned16 ;
  2326. ; LOOP FOR 2 pieces unaligned
  2327. unaligned4: ;
  2328. CMP RAX, 4 ;
2329. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2330. MOVUPS XMM1, [RBX] ;
  2331. ADD RBX, 16 ;
  2332. MULPS XMM1, XMM0 ;
  2333. MOVUPS XMM2, [RDX] ;
  2334. ADDPS XMM1, XMM2 ;
  2335. MOVUPS [RDX], XMM1 ;
  2336. ADD RDX, 16 ;
  2337. SUB RAX, 4 ;
  2338. JMP unaligned4 ;
  2339. ; one piece left OR non-contiguous data
  2340. single:
  2341. singlepieces: ;
  2342. CMP RAX, 0 ;
  2343. JLE endL ; len <= 0- > EXIT
  2344. MOVSS XMM1, [RBX]
  2345. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2346. MULSS XMM1, XMM0
  2347. MOVSS XMM2, [RDX] ;
  2348. ADDSS XMM1, XMM2 ;
  2349. MOVSS [RDX], XMM1
  2350. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2351. DEC RAX ; DEC(len)
  2352. JMP singlepieces ;
  2353. endL:
  2354. END IncMulARSRLoopSSE;
  2355. (*
  2356. PROCEDURE AlignedSPXSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2357. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2358. ; ; register initialization
  2359. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  2360. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
  2361. MOV RSI, [RBP+radr] ; RSI reserved for radr
  2362. MOV RAX, [RBP+len] ; RAX reserverd for length
  2363. MOV RCX, [RBP+stride] ; RCX reserved for stride
  2364. XORPD XMM2, XMM2 ;
  2365. XORPD XMM3, XMM3 ;
  2366. XORPD XMM4, XMM4 ;
  2367. XORPD XMM5, XMM5 ;
  2368. XORPD XMM6, XMM6 ;
  2369. XOR RDI, RDI ;
  2370. aligned4:
  2371. CMP RAX, 4 ;
  2372. JL aligned2 ; ; len < 4- > exit to singlepieces
  2373. MOV RSI, [RBP+radr] ;
  2374. ADD RSI, RDI ;
  2375. MOVAPD XMM7, [RBX] ;
  2376. MOVAPD XMM0, [RSI] ;
  2377. ADD RSI, RCX ;
  2378. MOVAPD XMM1, [RSI] ;
  2379. MULPD XMM0, XMM7 ;
  2380. ADDPD XMM2, XMM0 ;
  2381. ADD RSI, RCX ;
  2382. MOVAPD XMM0, [RSI] ;
  2383. MULPD XMM1, XMM7 ;
  2384. ADDPD XMM3, XMM1 ;
  2385. ADD RSI, RCX ;
  2386. MOVAPD XMM1, [RSI] ;
  2387. MULPD XMM0, XMM7 ;
  2388. ADDPD XMM4, XMM0 ;
  2389. ADD RSI, RCX ;
  2390. MOVAPD XMM0, [RSI] ;
  2391. MULPD XMM1, XMM7 ;
  2392. ADDPD XMM5, XMM1 ;
  2393. MULPD XMM0, XMM7 ;
  2394. ADDPD XMM6, XMM0 ;
  2395. ADD RBX, 16 ;
  2396. ADD RDI, 16 ;
  2397. MOV RSI, [RBP+radr] ;
  2398. ADD RSI, RDI ;
  2399. MOVAPD XMM7, [RBX] ;
  2400. MOVAPD XMM0, [RSI] ;
  2401. ADD RSI, RCX ;
  2402. MOVAPD XMM1, [RSI] ;
  2403. MULPD XMM0, XMM7 ;
  2404. ADDPD XMM2, XMM0 ;
  2405. ADD RSI, RCX ;
  2406. MOVAPD XMM0, [RSI] ;
  2407. MULPD XMM1, XMM7 ;
  2408. ADDPD XMM3, XMM1 ;
  2409. ADD RSI, RCX ;
  2410. MOVAPD XMM1, [RSI] ;
  2411. MULPD XMM0, XMM7 ;
  2412. ADDPD XMM4, XMM0 ;
  2413. ADD RSI, RCX ;
  2414. MOVAPD XMM0, [RSI] ;
  2415. MULPD XMM1, XMM7 ;
  2416. ADDPD XMM5, XMM1 ;
  2417. MULPD XMM0, XMM7 ;
  2418. ADDPD XMM6, XMM0 ;
  2419. ADD RBX, 16 ;
  2420. ADD RDI, 16 ;
  2421. SUB RAX, 4 ;
  2422. JMP aligned4 ;
  2423. aligned2:
  2424. CMP RAX, 2 ;
  2425. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2426. MOV RSI, [RBP+radr] ;
  2427. ADD RSI, RDI ;
  2428. MOVAPD XMM7, [RBX] ;
  2429. MOVAPD XMM0, [RSI] ;
  2430. ADD RSI, RCX ;
  2431. MOVAPD XMM1, [RSI] ;
  2432. MULPD XMM0, XMM7 ;
  2433. ADDPD XMM2, XMM0 ;
  2434. ADD RSI, RCX ;
  2435. MOVAPD XMM0, [RSI] ;
  2436. MULPD XMM1, XMM7 ;
  2437. ADDPD XMM3, XMM1 ;
  2438. ADD RSI, RCX ;
  2439. MOVAPD XMM1, [RSI] ;
  2440. MULPD XMM0, XMM7 ;
  2441. ADDPD XMM4, XMM0 ;
  2442. ADD RSI, RCX ;
  2443. MOVAPD XMM0, [RSI] ;
  2444. MULPD XMM1, XMM7 ;
  2445. ADDPD XMM5, XMM1 ;
  2446. MULPD XMM0, XMM7 ;
  2447. ADDPD XMM6, XMM0 ;
  2448. ADD RBX, 16 ;
  2449. ADD RDI, 16 ;
  2450. SUB RAX, 2 ;
  2451. JMP aligned2 ;
  2452. horizontaladd: ;
  2453. MOVAPD XMM1, XMM2 ;
  2454. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2455. ADDPD XMM2, XMM1 ;
  2456. MOVAPD XMM1, XMM3 ;
  2457. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2458. ADDPD XMM3, XMM1 ;
  2459. MOVAPD XMM1, XMM4 ;
  2460. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2461. ADDPD XMM4, XMM1 ;
  2462. MOVAPD XMM1, XMM5 ;
  2463. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2464. ADDPD XMM5, XMM1 ;
  2465. MOVAPD XMM1, XMM6 ;
  2466. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2467. ADDPD XMM6, XMM1 ;
  2468. singlepieces: ;
  2469. CMP RAX, 0 ;
  2470. JLE store ; len <= 0- > exit
  2471. MOV RSI, [RBP+radr] ;
  2472. MOVSD XMM7, [RBX] ;
  2473. MOVSD XMM0, [RSI+RDI] ;
  2474. ADD RSI, RCX ;
  2475. MOVSD XMM1, [RSI+RDI] ;
  2476. MULSD XMM0, XMM7 ;
  2477. ADDSD XMM2, XMM0 ;
  2478. ADD RSI, RCX ;
  2479. MOVSD XMM0, [RSI+RDI] ;
  2480. MULSD XMM1, XMM7 ;
  2481. ADDSD XMM3, XMM1 ;
  2482. ADD RSI, RCX ;
  2483. MOVSD XMM1, [RSI+RDI] ;
  2484. MULSD XMM0, XMM7 ;
  2485. ADDSD XMM4, XMM0 ;
  2486. ADD RSI, RCX ;
  2487. MOVSD XMM1, [RSI+RDI] ;
  2488. MULSD XMM0, XMM7 ;
  2489. ADDSD XMM4, XMM0 ;
  2490. ADD RSI, RCX ;
  2491. MOVSD XMM0, [RSI+RDI] ;
  2492. MULSD XMM1, XMM7 ;
  2493. ADDSD XMM5, XMM1 ;
  2494. MULSD XMM0, XMM7 ;
  2495. ADDSD XMM6, XMM0 ;
  2496. ADD RBX, 4 (* INC(ladr,incl) *)
  2497. ADD RDI, 4 (* INC(radr,incr) *)
  2498. DEC RAX ; DEC(len)
  2499. JMP singlepieces ;
  2500. store:
  2501. MOVSD [RDX], XMM2 ;
  2502. ADD RDX, [RBP+incd] ;
  2503. MOVSD [RDX], XMM3 ;
  2504. ADD RDX, [RBP+incd] ;
  2505. MOVSD [RDX], XMM4 ;
  2506. ADD RDX, [RBP+incd] ;
  2507. MOVSD [RDX], XMM5 ;
  2508. ADD RDX, [RBP+incd] ;
  2509. MOVSD [RDX], XMM6 ;
  2510. end:
  2511. END AlignedSPXSSE5;
  2512. *)
  2513. (* sse version of scalar product *)
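(* Scalar sketch of AlignedSPXSSE below (assumption: both operands are contiguous LONGREAL vectors and, except for a possible tail, 16 byte aligned): the sum starts with the value at dadr if add is TRUE and with 0 otherwise; then ladr[i]*radr[i] is accumulated for i = 0 .. len-1 and the result is stored at dadr. *)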
  2514. PROCEDURE AlignedSPXSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  2515. add: BOOLEAN );
  2516. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2517. ; register initialization
2518. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2519. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2520. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
  2521. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2522. XORPD XMM0, XMM0 ;
  2523. CMP [RBP+add], 0 ; add?
  2524. JE aligned8 ; no add
  2525. MOVSD XMM0, [RDX] ;
  2526. aligned8:
  2527. CMP RAX, 8 ;
2528. JL aligned2 ; len < 8 -> EXIT TO aligned2
  2529. MOVAPD XMM1, [RBX] ;
  2530. MOVAPD XMM2, [RBX+16] ;
  2531. MOVAPD XMM3, [RBX+32] ;
  2532. MOVAPD XMM4, [RCX] ;
  2533. MOVAPD XMM5, [RCX+16] ;
  2534. MOVAPD XMM6, [RCX+32] ;
  2535. MULPD XMM1, XMM4 ;
  2536. ADDPD XMM0, XMM1 ;
  2537. MULPD XMM2, XMM5 ;
  2538. ADDPD XMM0, XMM2 ;
  2539. MULPD XMM3, XMM6 ;
  2540. ADDPD XMM0, XMM3 ;
  2541. MOVAPD XMM7, [RBX+48] ;
  2542. MOVAPD XMM1, [RCX+48] ;
  2543. MULPD XMM1, XMM7 ;
  2544. ADDPD XMM0, XMM1 ;
  2545. ADD RBX, 64 ;
  2546. ADD RCX, 64 ;
  2547. SUB RAX, 8 ;
  2548. JMP aligned8 ;
  2549. ; LOOP FOR 2 pieces aligned
  2550. aligned4:
  2551. CMP RAX, 4 ;
  2552. JL aligned2 ; ; len < 4- > EXIT TO singlepieces
  2553. MOVAPD XMM1, [RBX] ;
  2554. MOVAPD XMM2, [RCX] ;
  2555. MOVAPD XMM3, [RBX+16] ;
  2556. MOVAPD XMM4, [RCX+16] ;
  2557. MULPD XMM1, XMM2 ;
  2558. ADDPD XMM0, XMM1 ;
  2559. MULPD XMM3, XMM4 ;
  2560. ADDPD XMM0, XMM3 ;
  2561. ADD RBX, 32 ;
  2562. ADD RCX, 32 ;
  2563. SUB RAX, 4 ;
  2564. JMP aligned4 ;
  2565. aligned2:
  2566. CMP RAX, 2 ;
2567. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  2568. MOVAPD XMM1, [RBX] ;
  2569. MOVAPD XMM2, [RCX] ;
  2570. MULPD XMM1, XMM2 ;
  2571. ADDPD XMM0, XMM1 ;
  2572. ADD RBX, 16 ;
  2573. ADD RCX, 16 ;
  2574. SUB RAX, 2 ;
  2575. JMP aligned2 ;
  2576. horizontaladd: ;
  2577. MOVAPD XMM1, XMM0 ;
  2578. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2579. ADDPD XMM0, XMM1 ;
  2580. singlepieces: ;
  2581. CMP RAX, 0 ;
  2582. JLE store ; len <= 0- > EXIT
  2583. MOVSD XMM1, [RBX]
  2584. MOVSD XMM2, [RCX]
  2585. MULSD XMM1, XMM2
  2586. ADDSD XMM0, XMM1
  2587. ADD RBX, 8 ; INC(ladr, incl)
  2588. ADD RCX, 8 ; INC(radr, incr)
  2589. DEC RAX ; DEC(len)
  2590. JMP singlepieces ;
  2591. store:
  2592. MOVSD [RDX], XMM0 ;
  2593. endL:
  2594. END AlignedSPXSSE;
  2595. (*
  2596. PROCEDURE AlignedSPRSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2597. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2598. ; register initialization
  2599. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  2600. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
  2601. MOV RSI, [RBP+radr] ; RCX reserved for radr
  2602. MOV RAX, [RBP+len] ; RAX reserverd for length
  2603. MOV RCX, [RBP+stride] ;
  2604. XORPS XMM2, XMM2 ;
  2605. XORPS XMM3, XMM3 ;
  2606. XORPS XMM4, XMM4 ;
  2607. XORPS XMM5, XMM5 ;
  2608. XORPS XMM6, XMM6 ;
  2609. XOR RDI, RDI ;
  2610. aligned8:
  2611. CMP RAX, 8 ;
  2612. JL aligned4 ; ; len < 4- > exit to singlepieces
  2613. PREFETCH0 24[RBX] ;
  2614. ; PREFETCH0[RSI] ;
  2615. MOV RSI, [RBP+radr] ;
  2616. ADD RSI, RDI ;
  2617. MOVAPS XMM7, [RBX] ;
  2618. MOVAPS XMM0, [RSI] ;
  2619. ADD RSI, RCX ;
  2620. MOVAPS XMM1, [RSI] ;
  2621. MULPS XMM0, XMM7 ;
  2622. ADDPS XMM2, XMM0 ;
  2623. ADD RSI, RCX ;
  2624. MOVAPS XMM0, [RSI] ;
  2625. MULPS XMM1, XMM7 ;
  2626. ADDPS XMM3, XMM1 ;
  2627. ADD RSI, RCX ;
  2628. MOVAPS XMM1, [RSI] ;
  2629. MULPS XMM0, XMM7 ;
  2630. ADDPS XMM4, XMM0 ;
  2631. ADD RSI, RCX ;
  2632. MOVAPS XMM0, [RSI] ;
  2633. MULPS XMM1, XMM7 ;
  2634. ADDPS XMM5, XMM1 ;
  2635. MULPS XMM0, XMM7 ;
  2636. ADDPS XMM6, XMM0 ;
  2637. ADD RBX, 16 ;
  2638. ADD RDI, 16 ;
  2639. MOV RSI, [RBP+radr] ;
  2640. ADD RSI, RDI ;
  2641. MOVAPS XMM7, [RBX] ;
  2642. MOVAPS XMM0, [RSI] ;
  2643. ADD RSI, RCX ;
  2644. MOVAPS XMM1, [RSI] ;
  2645. MULPS XMM0, XMM7 ;
  2646. ADDPS XMM2, XMM0 ;
  2647. ADD RSI, RCX ;
  2648. MOVAPS XMM0, [RSI] ;
  2649. MULPS XMM1, XMM7 ;
  2650. ADDPS XMM3, XMM1 ;
  2651. ADD RSI, RCX ;
  2652. MOVAPS XMM1, [RSI] ;
  2653. MULPS XMM0, XMM7 ;
  2654. ADDPS XMM4, XMM0 ;
  2655. ADD RSI, RCX ;
  2656. MOVAPS XMM0, [RSI] ;
  2657. MULPS XMM1, XMM7 ;
  2658. ADDPS XMM5, XMM1 ;
  2659. MULPS XMM0, XMM7 ;
  2660. ADDPS XMM6, XMM0 ;
  2661. ADD RBX, 16 ;
  2662. ADD RDI, 16 ;
  2663. SUB RAX, 8 ;
  2664. JMP aligned8 ;
  2665. aligned4:
  2666. CMP RAX, 4 ;
  2667. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2668. MOV RSI, [RBP+radr] ;
  2669. ADD RSI, RDI ;
  2670. MOVAPS XMM7, [RBX] ;
  2671. MOVAPS XMM0, [RSI] ;
  2672. ADD RSI, RCX ;
  2673. MOVAPS XMM1, [RSI] ;
  2674. MULPS XMM0, XMM7 ;
  2675. ADDPS XMM2, XMM0 ;
  2676. ADD RSI, RCX ;
  2677. MOVAPS XMM0, [RSI] ;
  2678. MULPS XMM1, XMM7 ;
  2679. ADDPS XMM3, XMM1 ;
  2680. ADD RSI, RCX ;
  2681. MOVAPS XMM1, [RSI] ;
  2682. MULPS XMM0, XMM7 ;
  2683. ADDPS XMM4, XMM0 ;
  2684. ADD RSI, RCX ;
  2685. MOVAPS XMM0, [RSI] ;
  2686. MULPS XMM1, XMM7 ;
  2687. ADDPS XMM5, XMM1 ;
  2688. MULPS XMM0, XMM7 ;
  2689. ADDPS XMM6, XMM0 ;
  2690. ADD RBX, 16 ;
  2691. ADD RDI, 16 ;
  2692. SUB RAX, 4 ;
  2693. JMP aligned4 ;
  2694. horizontaladd: ;
  2695. MOVLHPS XMM1, XMM2 ;
  2696. ADDPS XMM1, XMM2 ;
  2697. SHUFPS XMM2, XMM1, 48 ;
  2698. ADDPS XMM2, XMM1 ;
  2699. MOVHLPS XMM2, XMM2 ;
  2700. MOVLHPS XMM1, XMM3 ;
  2701. ADDPS XMM1, XMM3 ;
  2702. SHUFPS XMM3, XMM1, 48 ;
  2703. ADDPS XMM3, XMM1 ;
  2704. MOVHLPS XMM3, XMM3 ;
  2705. MOVLHPS XMM1, XMM4 ;
  2706. ADDPS XMM1, XMM4 ;
  2707. SHUFPS XMM4, XMM1, 48 ;
  2708. ADDPS XMM4, XMM1 ;
  2709. MOVHLPS XMM4, XMM4 ;
  2710. MOVLHPS XMM1, XMM5 ;
  2711. ADDPS XMM1, XMM5 ;
  2712. SHUFPS XMM5, XMM1, 48 ;
  2713. ADDPS XMM5, XMM1 ;
  2714. MOVHLPS XMM5, XMM5 ;
  2715. MOVLHPS XMM1, XMM6 ;
  2716. ADDPS XMM1, XMM6 ;
  2717. SHUFPS XMM6, XMM1, 48 ;
  2718. ADDPS XMM6, XMM1 ;
  2719. MOVHLPS XMM6, XMM6 ;
  2720. singlepieces: ;
  2721. CMP RAX, 0 ;
  2722. JLE store ; len <= 0- > exit
  2723. MOV RSI, [RBP+radr] ;
  2724. MOVSS XMM7, [RBX] ;
  2725. MOVSS XMM0, [RSI+RDI] ;
  2726. ADD RSI, RCX ;
  2727. MOVSS XMM1, [RSI+RDI] ;
  2728. MULSS XMM0, XMM7 ;
  2729. ADDSS XMM2, XMM0 ;
  2730. ADD RSI, RCX ;
  2731. MOVSS XMM0, [RSI+RDI] ;
  2732. MULSS XMM1, XMM7 ;
  2733. ADDSS XMM3, XMM1 ;
  2734. ADD RSI, RCX ;
  2735. MOVSS XMM1, [RSI+RDI] ;
  2736. MULSS XMM0, XMM7 ;
  2737. ADDSS XMM4, XMM0 ;
  2738. ADD RSI, RCX ;
  2739. MOVSS XMM0, [RSI+RDI] ;
  2740. MULSS XMM1, XMM7 ;
  2741. ADDSS XMM5, XMM1 ;
  2742. MULSS XMM0, XMM7 ;
  2743. ADDSS XMM6, XMM0 ;
  2744. ADD RBX, 4 (* INC(ladr,incl) *)
  2745. ADD RDI, 4 (* INC(radr,incr) *)
  2746. DEC RAX ; DEC(len)
  2747. JMP singlepieces ;
  2748. store:
  2749. MOVSS [RDX], XMM2 ;
  2750. ADD RDX, [RBP+incd] ;
  2751. MOVSS [RDX], XMM3 ;
  2752. ADD RDX, [RBP+incd] ;
  2753. MOVSS [RDX], XMM4 ;
  2754. ADD RDX, [RBP+incd] ;
  2755. MOVSS [RDX], XMM5 ;
  2756. ADD RDX, [RBP+incd] ;
  2757. MOVSS [RDX], XMM6 ;
  2758. end:
  2759. END AlignedSPRSSE5;
  2760. *)
  2761. PROCEDURE AlignedSPRSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  2762. add: BOOLEAN );
  2763. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2764. ; register initialization
  2765. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2766. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2767. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
2768. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2769. XORPS XMM0, XMM0 ;
  2770. CMP [RBP+add], 0 ; add?
  2771. JE aligned16 ; no add
  2772. MOVSS XMM0, [RDX] ;
  2773. aligned16:
  2774. CMP RAX, 16 ;
2775. JL aligned8 ; len < 16 -> EXIT TO aligned8
  2776. MOVAPS XMM1, [RBX] ;
  2777. MOVAPS XMM4, [RCX] ;
  2778. MOVAPS XMM2, [RBX+16] ;
  2779. MOVAPS XMM5, [RCX+16] ;
  2780. MULPS XMM1, XMM4 ;
  2781. ADDPS XMM0, XMM1 ;
  2782. MOVAPS XMM3, [RBX+32] ;
  2783. MOVAPS XMM6, [RCX+32] ;
  2784. MULPS XMM2, XMM5 ;
  2785. ADDPS XMM0, XMM2 ;
  2786. MOVAPS XMM7, [RBX+48] ;
  2787. MOVAPS XMM1, [RCX+48] ;
  2788. MULPS XMM3, XMM6 ;
  2789. ADDPS XMM0, XMM3 ;
  2790. MULPS XMM1, XMM7 ;
  2791. ADDPS XMM0, XMM1 ;
  2792. ADD RBX, 64 ;
  2793. ADD RCX, 64 ;
  2794. SUB RAX, 16 ;
  2795. JMP aligned16 ;
  2796. ; LOOP FOR 8 pieces aligned
  2797. aligned8:
  2798. CMP RAX, 8 ;
2799. JL aligned4 ; len < 8 -> EXIT TO aligned4
  2800. MOVAPS XMM1, [RBX] ;
  2801. MOVAPS XMM4, [RCX] ;
  2802. MOVAPS XMM2, [RBX+16] ;
  2803. MOVAPS XMM5, [RCX+16] ;
  2804. MULPS XMM1, XMM4 ;
  2805. ADDPS XMM0, XMM1 ;
  2806. MULPS XMM2, XMM5 ;
  2807. ADDPS XMM0, XMM2 ;
  2808. ADD RBX, 32 ;
  2809. ADD RCX, 32 ;
  2810. SUB RAX, 8 ;
  2811. JMP aligned8 ;
  2812. aligned4:
  2813. CMP RAX, 4 ;
  2814. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  2815. MOVAPS XMM1, [RBX] ;
  2816. MOVAPS XMM2, [RCX] ;
  2817. MULPS XMM1, XMM2 ;
  2818. ADDPS XMM0, XMM1 ;
  2819. ADD RBX, 16 ;
  2820. ADD RCX, 16 ;
  2821. SUB RAX, 4 ;
  2822. JMP aligned4 ;
  2823. horizontaladd: ;
  2824. MOVAPS XMM1, XMM0 ;
  2825. ; 1*0 (* dest 0 -> dest 0 *) + 4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  2826. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  2827. ADDPS XMM1, XMM0 ;
  2828. MOVAPS XMM0, XMM1
  2829. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  2830. ADDPS XMM0, XMM1 ;
  2831. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  2832. singlepieces: ;
  2833. CMP RAX, 0 ;
  2834. JLE store ; len <= 0- > EXIT
  2835. MOVSS XMM1, [RBX]
  2836. MOVSS XMM2, [RCX]
  2837. MULSS XMM1, XMM2
  2838. ADDSS XMM0, XMM1
  2839. ADD RBX, 4 ; INC(ladr, incl)
  2840. ADD RCX, 4 ; INC(radr, incr)
  2841. DEC RAX ; DEC(len)
  2842. JMP singlepieces ;
  2843. store:
  2844. MOVSS [RDX], XMM0 ;
  2845. endL:
  2846. END AlignedSPRSSE;
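  (* Reference semantics of AlignedSPRSSE, as a plain Oberon sketch (illustrative only, not used by the module);
  it assumes two contiguous REAL vectors of length len at ladr and radr:
  IF add THEN SYSTEM.GET( dadr, sum ) ELSE sum := 0 END;
  FOR i := 0 TO len - 1 DO
  SYSTEM.GET( ladr + i * 4, x ); SYSTEM.GET( radr + i * 4, y ); sum := sum + x * y;
  END;
  SYSTEM.PUT( dadr, sum );
  the SSE code accumulates four partial sums in XMM0 and reduces them at label horizontaladd. *)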
  2847. (*
  2848. (* sse version of scalar product *)
  2849. PROCEDURE AlignedSPRSSE( ladr, radr, dadr, rows, stride, dinc, len: LONGINT );
  2850. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2851. ; register initialization
  2852. MOV RDI, [RBP+radr] ; radr start
  2853. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  2854. MOV RSI, [RBP+rows] ; outer loop counter
  2855. outerloop:
  2856. CMP RSI, 0 ;
  2857. JLE end ;
  2858. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
  2859. MOV RCX, RDI ; RCX reserved for radr
  2860. MOV RAX, [RBP+len] ; RAX reserved for length
  2861. XORPS XMM0, XMM0 ;
  2862. aligned16:
  2863. CMP RAX, 16 ;
  2864. JL aligned8 ; len < 4- > exit to singlepieces
  2865. MOVAPS XMM1, [RBX] ;
  2866. MOVAPS XMM2, [RBX+16] ;
  2867. MOVAPS XMM3, [RBX+32] ;
  2868. MOVAPS XMM4, [RCX] ;
  2869. MOVAPS XMM5, [RCX+16] ;
  2870. MOVAPS XMM6, [RCX+32] ;
  2871. MULPS XMM1, XMM4 ;
  2872. ADDPS XMM0, XMM1 ;
  2873. MULPS XMM2, XMM5 ;
  2874. ADDPS XMM0, XMM2 ;
  2875. MULPS XMM3, XMM6 ;
  2876. ADDPS XMM0, XMM3 ;
  2877. MOVAPS XMM7, [RBX+48] ;
  2878. MOVAPS XMM1, [RCX+48] ;
  2879. MULPS XMM1, XMM7 ;
  2880. ADDPS XMM0, XMM1 ;
  2881. ADD RBX, 64 ;
  2882. ADD RCX, 64 ;
  2883. SUB RAX, 16 ;
  2884. JMP aligned16 ;
  2885. ; loop for 8 pieces aligned
  2886. aligned8:
  2887. CMP RAX, 8 ;
  2888. JL aligned4 ; ; len < 4- > exit to singlepieces
  2889. MOVAPS XMM1, [RBX] ;
  2890. MOVAPS XMM2, [RBX+16] ;
  2891. MOVAPS XMM4, [RCX] ;
  2892. MOVAPS XMM5, [RCX+16] ;
  2893. MULPS XMM1, XMM4 ;
  2894. ADDPS XMM0, XMM1 ;
  2895. MULPS XMM2, XMM5 ;
  2896. ADDPS XMM0, XMM2 ;
  2897. ADD RBX, 32 ;
  2898. ADD RCX, 32 ;
  2899. SUB RAX, 8 ;
  2900. JMP aligned8 ;
  2901. aligned4:
  2902. CMP RAX, 4 ;
  2903. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2904. MOVAPS XMM1, [RBX] ;
  2905. MOVAPS XMM2, [RCX] ;
  2906. MULPS XMM1, XMM2 ;
  2907. ADDPS XMM0, XMM1 ;
  2908. ADD RBX, 16 ;
  2909. ADD RCX, 16 ;
  2910. SUB RAX, 4 ;
  2911. JMP aligned4 ;
  2912. horizontaladd: ;
  2913. MOVAPS XMM1, XMM0 ;
  2914. SHUFPS XMM1, XMM1, 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  2915. ADDPS XMM1, XMM0 ;
  2916. MOVAPS XMM0, XMM1
  2917. SHUFPS XMM0, XMM0, 16*3 ; (* src 3-> dest 2 *)
  2918. ADDPS XMM0, XMM1 ;
  2919. SHUFPS XMM0, XMM0, 1*2 ; (* dest 2 -> dest 0 *)
  2920. singlepieces: ;
  2921. CMP RAX, 0 ;
  2922. JLE store ; len <= 0- > exit
  2923. MOVSS XMM1, [RBX]
  2924. MOVSS XMM2, [RCX]
  2925. MULSS XMM1, XMM2
  2926. ADDSS XMM0, XMM1
  2927. ADD RBX, 4 (* INC(ladr,incl) *)
  2928. ADD RCX, 4 (* INC(radr,incr) *)
  2929. DEC RAX ; DEC(len)
  2930. JMP singlepieces ;
  2931. store:
  2932. MOVSS [RDX], XMM0 ;
  2933. ADD RDX, [RBP+dinc] ;
  2934. ADD RDI, [RBP+stride] ;
  2935. DEC RSI ;
  2936. JMP outerloop ;
  2937. end:
  2938. END AlignedSPRSSE;
  2939. *)
  2940. PROCEDURE Copy4( ladr, dadr: ADDRESS; linc, dinc, len: SIZE);
  2941. CODE {SYSTEM.AMD64}
  2942. MOV RSI, [RBP+ladr] ; RSI := ladr
  2943. MOV RDI, [RBP+dadr] ; RDI := dadr
  2944. MOV RCX, [RBP+len] ; RCX := len
  2945. MOV RAX, [RBP+linc] ;
  2946. CMP RAX, 4 ;
  2947. JNE loopL ;
  2948. MOV RAX, [RBP+dinc] ;
  2949. CMP RAX, 4 ;
  2950. JNE loopL ;
  2951. fastmove:
  2952. CLD ; clear direction flag (forward copy)
  2953. REP ;
  2954. MOVSD ; copy len doublewords (4 bytes each)
  2955. JMP endL ;
  2956. loopL:
  2957. CMP RCX, 0 ;
  2958. JLE endL ; WHILE RCX > 0 DO
  2959. MOV EAX, [RSI] ; EAX := SYSTEM.GET32(RSI)
  2960. MOV [RDI], EAX ; SYSTEM.PUT32(RDI, EAX)
  2961. ADD RSI, [RBP+linc] ; INC(RSI, linc)
  2962. ADD RDI, [RBP+dinc] ; INC(RDI, dinc)
  2963. DEC RCX ; DEC(RCX)
  2964. JMP loopL
  2965. endL:
  2966. END Copy4;
  2967. PROCEDURE Copy8( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2968. CODE {SYSTEM.AMD64}
  2969. MOV RSI, [RBP+ladr] ; RSI := ladr
  2970. MOV RDI, [RBP+dadr] ; RDI := dadr
  2971. MOV RCX, [RBP+len] ; RCX := len
  2972. MOV RAX, [RBP+linc] ;
  2973. CMP RAX, 8 ;
  2974. JNE loopL ;
  2975. MOV RAX, [RBP+dinc] ;
  2976. CMP RAX, 8 ;
  2977. JNE loopL ;
  2978. fastmove:
  2979. SHL RCX, 1 ; two doublewords per 8 byte element
  2980. CLD ; clear direction flag (forward copy)
  2981. REP ;
  2982. MOVSD ; copy 2*len doublewords (8 bytes per element)
  2983. JMP endL ;
  2984. loopL:
  2985. CMP RCX, 0 ;
  2986. JLE endL ; WHILE RCX > 0 DO
  2987. MOV RAX, [RSI] ; RAX := SYSTEM.GET64(RSI)
  2988. MOV [RDI], RAX ; SYSTEM.PUT64(RDI, RAX)
  2989. ADD RSI, [RBP+linc] ; INC(RSI, linc)
  2990. ADD RDI, [RBP+dinc] ; INC(RDI, dinc)
  2991. DEC RCX ; DEC(RCX)
  2992. JMP loopL
  2993. endL:
  2994. END Copy8;
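  (* Copy4/Copy8 copy len elements of 4/8 bytes from ladr to dadr using the given byte increments;
  when both increments equal the element size the loop degenerates to a single REP MOVSD string copy
  (for Copy8 the count is doubled because MOVSD moves 4 bytes at a time). *)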
  2995. PROCEDURE Transpose4A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  2996. CODE {SYSTEM.AMD64}
  2997. startrows:
  2998. MOV RAX, [RBP+rows] ;
  2999. startouter:
  3000. CMP RAX, 0 ;
  3001. JLE endL ;
  3002. MOV RSI, [RBP+ladr] ;
  3003. MOV RDI, [RBP+dadr] ;
  3004. MOV RBX, [RBP+linc] ;
  3005. MOV RCX, [RBP+dstride] ;
  3006. MOV RAX, [RBP+cols] ;
  3007. startinner:
  3008. CMP RAX, 0 ;
  3009. JLE endinner ;
  3010. MOV EDX, [RSI] ; copy one 4 byte element
  3011. MOV [RDI], EDX ;
  3012. ADD RSI, RBX ;
  3013. ADD RDI, RCX ;
  3014. DEC RAX ;
  3015. JMP startinner ;
  3016. endinner:
  3017. MOV RSI, [RBP+ladr] ;
  3018. ADD RSI, [RBP+lstride] ;
  3019. MOV [RBP+ladr], RSI
  3020. MOV RDI, [RBP+dadr] ;
  3021. ADD RDI, [RBP+dinc] ;
  3022. MOV [RBP+dadr], RDI ;
  3023. MOV RAX, [RBP+rows] ;
  3024. DEC RAX ;
  3025. MOV [RBP+rows], RAX ;
  3026. JMP startouter ;
  3027. endL:
  3028. END Transpose4A;
  3029. PROCEDURE Transpose4( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3030. VAR l, d, c: SIZE; BlockSize: SIZE;
  3031. BEGIN
  3032. BlockSize :=
  3033. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3034. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3035. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3036. BlockSize := MAX( 8, BlockSize );
  3037. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3038. WHILE (rows >= BlockSize) DO
  3039. c := cols; l := ladr; d := dadr;
  3040. WHILE (c >= BlockSize) DO
  3041. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3042. BlockSize );
  3043. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3044. INC( d, BlockSize * dstride );
  3045. END;
  3046. IF c > 0 THEN
  3047. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3048. END;
  3049. DEC( rows, BlockSize ); INC( ladr, BlockSize * lstride );
  3050. INC( dadr, BlockSize * dinc );
  3051. END;
  3052. IF (rows > 0) THEN
  3053. c := cols; l := ladr; d := dadr;
  3054. WHILE (c >= BlockSize) DO
  3055. Transpose4A( l, d, lstride, linc, dstride, dinc, rows,
  3056. BlockSize );
  3057. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3058. INC( d, BlockSize * dstride );
  3059. END;
  3060. IF c > 0 THEN
  3061. Transpose4A( l, d, lstride, linc, dstride, dinc, rows, c );
  3062. END;
  3063. END;
  3064. END Transpose4;
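  (* Transpose4 tiles the rows/cols into BlockSize x BlockSize blocks, with BlockSize chosen so that the
  rows touched in one block stay within the L2 cache, and lets Transpose4A copy each block element by
  element; Transpose8 below is the analogous routine for 8 byte elements. *)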
  3065. PROCEDURE Transpose8( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3066. VAR l, d, c: SIZE; BlockSize: SIZE;
  3067. BEGIN
  3068. BlockSize :=
  3069. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3070. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3071. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3072. BlockSize := MAX( 8, BlockSize );
  3073. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3074. WHILE (rows >= BlockSize) DO
  3075. c := cols; l := ladr; d := dadr;
  3076. WHILE (c >= BlockSize) DO
  3077. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3078. BlockSize );
  3079. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3080. INC( d, BlockSize * dstride );
  3081. END;
  3082. IF c > 0 THEN
  3083. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3084. END;
  3085. DEC( rows, BlockSize ); INC( ladr, lstride * BlockSize );
  3086. INC( dadr, dinc * BlockSize );
  3087. END;
  3088. IF (rows > 0) THEN
  3089. c := cols; l := ladr; d := dadr;
  3090. WHILE (c >= BlockSize) DO
  3091. Transpose8A( l, d, lstride, linc, dstride, dinc, rows,
  3092. BlockSize );
  3093. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3094. INC( d, BlockSize * dstride );
  3095. END;
  3096. IF c > 0 THEN
  3097. Transpose8A( l, d, lstride, linc, dstride, dinc, rows, c );
  3098. END;
  3099. END;
  3100. END Transpose8;
  3101. PROCEDURE Transpose8A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3102. CODE {SYSTEM.AMD64}
  3103. startrows:
  3104. MOV RAX, [RBP+rows] ;
  3105. startouter:
  3106. CMP RAX, 0 ;
  3107. JLE endL ;
  3108. MOV RSI, [RBP+ladr] ;
  3109. MOV RDI, [RBP+dadr] ;
  3110. MOV RBX, [RBP+linc] ;
  3111. MOV RCX, [RBP+dstride] ;
  3112. MOV RAX, [RBP+cols] ;
  3113. startinner:
  3114. CMP RAX, 0 ;
  3115. JLE endinner ;
  3116. MOV EDX, [RSI] ; copy one 8 byte element in two 4 byte halves
  3117. MOV [RDI], EDX ;
  3118. MOV EDX, [RSI+4] ;
  3119. MOV [RDI+4], EDX ;
  3120. ADD RSI, RBX ;
  3121. ADD RDI, RCX ;
  3122. DEC RAX ;
  3123. JMP startinner ;
  3124. endinner:
  3125. MOV RSI, [RBP+ladr] ;
  3126. ADD RSI, [RBP+lstride] ;
  3127. MOV [RBP+ladr], RSI
  3128. MOV RDI, [RBP+dadr] ;
  3129. ADD RDI, [RBP+dinc] ;
  3130. MOV [RBP+dadr], RDI ;
  3131. MOV RAX, [RBP+rows] ;
  3132. DEC RAX ;
  3133. MOV [RBP+rows], RAX ;
  3134. JMP startouter ;
  3135. endL:
  3136. END Transpose8A;
  3137. PROCEDURE SSEMul24BlockR( VAR CbFirst: SIZE;
  3138. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3139. add: BOOLEAN );
  3140. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3141. MatrixOfResultsSetup:
  3142. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3143. RowOfResultsLoop:
  3144. MOV RBX, 0 ; counter FOR columns IN B-Cb
  3145. DotProductSetup:
  3146. MOV RSI, [RBP+matrixA] ; matrixA
  3147. MOV RDI, [RBP+matrixB] ; matrixB
  3148. LEA RDI, [RDI+RBX*4] ; current position IN matrixB
  3149. XORPS XMM2, XMM2
  3150. XORPS XMM3, XMM3
  3151. XORPS XMM4, XMM4
  3152. XORPS XMM5, XMM5
  3153. XORPS XMM6, XMM6
  3154. XORPS XMM7, XMM7
  3155. MOV RAX, 0 ;
  3156. MOV AL, [RBP+add] ;
  3157. CMP AL, 0 ; add?
  3158. JE DotProductLoop ;
  3159. MOV RAX, [RBP+matrixC] ; matrixC
  3160. LEA RAX, [RAX+RBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3161. MOVUPS XMM2, [RAX]
  3162. MOVUPS XMM3, [RAX+16]
  3163. MOVUPS XMM4, [RAX+32]
  3164. MOVUPS XMM5, [RAX+48]
  3165. MOVUPS XMM6, [RAX+64]
  3166. MOVUPS XMM7, [RAX+80]
  3167. MOV RAX, 0
  3168. DotProductLoop:
  3169. MOV RDX, [RSI+RAX*4]
  3170. SHL RDX, 1
  3171. CMP RDX, 0
  3172. JE SparseEntryEscape
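  ; the preceding test skips the six multiply/add pairs when the current entry of A is zero (the SHL drops
  ; the sign bit); since the load is 64 bit the branch only fires when the neighbouring entry is zero too,
  ; so skipping stays safe, merely conservative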
  3173. MOVSS XMM0, [RSI+RAX*4]
  3174. SHUFPS XMM0, XMM0, 0H
  3175. MOVUPS XMM1, [RDI]
  3176. MULPS XMM1, XMM0
  3177. ADDPS XMM2, XMM1
  3178. MOVUPS XMM1, [RDI+16]
  3179. MULPS XMM1, XMM0
  3180. ADDPS XMM3, XMM1
  3181. MOVUPS XMM1, [RDI+32]
  3182. MULPS XMM1, XMM0
  3183. ADDPS XMM4, XMM1
  3184. MOVUPS XMM1, [RDI+48]
  3185. MULPS XMM1, XMM0
  3186. ADDPS XMM5, XMM1
  3187. MOVUPS XMM1, [RDI+64]
  3188. MULPS XMM1, XMM0
  3189. ADDPS XMM6, XMM1
  3190. MOVUPS XMM1, [RDI+80]
  3191. MULPS XMM1, XMM0
  3192. ADDPS XMM7, XMM1
  3193. SparseEntryEscape:
  3194. ADD RDI, [RBP+StrideB] ; StrideB
  3195. INC RAX
  3196. CMP RAX, [RBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3197. JL DotProductLoop
  3198. ; end DotProductLoop
  3199. MOV RAX, [RBP+matrixC] ; matrixC
  3200. LEA RAX, [RAX+RBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3201. MOVUPS [RAX], XMM2
  3202. MOVUPS [RAX+16], XMM3
  3203. MOVUPS [RAX+32], XMM4
  3204. MOVUPS [RAX+48], XMM5
  3205. MOVUPS [RAX+64], XMM6
  3206. MOVUPS [RAX+80], XMM7
  3207. ADD RBX, 24 ; move over TO next batch OF 24
  3208. MOV RDX, RBX
  3209. ADD RDX, 24
  3210. CMP RDX, [RBP+Cb] ; Cb, check TO see IF row IS complete
  3211. JLE DotProductSetup
  3212. ; end RowOfResultsLoop
  3213. MOV RAX, [RBP+matrixA] ; matrixA
  3214. ADD RAX, [RBP+StrideA] ; StrideA
  3215. MOV [RBP+matrixA], RAX ; matrixA
  3216. MOV RAX, [RBP+matrixC] ; matrixC
  3217. ADD RAX, [RBP+StrideC] ; StrideC
  3218. MOV [RBP+matrixC], RAX ; matrixC
  3219. INC RCX
  3220. CMP RCX, [RBP+Ra] ; Ra
  3221. JL RowOfResultsLoop
  3222. Done:
  3223. MOV RAX, [RBP+CbFirst] ; CbFirst
  3224. MOV [RAX], RBX ;
  3225. END SSEMul24BlockR;
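  (* SSEMul24BlockR computes, for every row i of A, batches of 24 consecutive columns of C:
  C[i, j..j+23] := (C[i, j..j+23], IF add) + SUM over k of A[i,k] * B[k, j..j+23],
  using six XMM accumulators of four REALs each; RBX holds the first column of the current batch and is
  returned in CbFirst so the caller can finish the remaining (< 24) columns with a narrower kernel. *)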
  3226. (*! might be better to make a 10Block operation and utilize 2 registers for temporary calculations, see the article about Emmerald *)
  3227. PROCEDURE SSEMul12BlockX( VAR CbFirst: SIZE;
  3228. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC :ADDRESS;
  3229. add: BOOLEAN );
  3230. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3231. MatrixOfResultsSetup:
  3232. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3233. RowOfResultsLoop:
  3234. MOV RBX, 0 ; counter FOR columns IN B-Cb
  3235. DotProductSetup:
  3236. MOV RSI, [RBP+matrixA] ; matrixA
  3237. MOV RDI, [RBP+matrixB] ; matrixB
  3238. LEA RDI, [RDI+RBX*8]
  3239. XORPD XMM2, XMM2
  3240. XORPD XMM3, XMM3
  3241. XORPD XMM4, XMM4
  3242. XORPD XMM5, XMM5
  3243. XORPD XMM6, XMM6
  3244. XORPD XMM7, XMM7
  3245. MOV RAX, 0 ;
  3246. MOV AL, [RBP+add] ;
  3247. CMP AL, 0 ; add?
  3248. JE DotProductLoop ;
  3249. MOV RAX, [RBP+matrixC] ; matrixC
  3250. LEA RAX, [RAX+RBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3251. MOVUPD XMM2, [RAX]
  3252. MOVUPD XMM3, [RAX+16]
  3253. MOVUPD XMM4, [RAX+32]
  3254. MOVUPD XMM5, [RAX+48]
  3255. MOVUPD XMM6, [RAX+64]
  3256. MOVUPD XMM7, [RAX+80]
  3257. MOV RAX, 0
  3258. DotProductLoop:
  3259. ; MOV RDX, [RSI+RAX*8]
  3260. ; SHL RDX, 1
  3261. ; CMP RDX, 0
  3262. ; JE SparseEntryEscape
  3263. MOVSD XMM0, [RSI+RAX*8]
  3264. SHUFPD XMM0, XMM0, 0H
  3265. MOVUPD XMM1, [RDI]
  3266. MULPD XMM1, XMM0
  3267. ADDPD XMM2, XMM1
  3268. MOVUPD XMM1, [RDI+16]
  3269. MULPD XMM1, XMM0
  3270. ADDPD XMM3, XMM1
  3271. MOVUPD XMM1, [RDI+32]
  3272. MULPD XMM1, XMM0
  3273. ADDPD XMM4, XMM1
  3274. MOVUPD XMM1, [RDI+48]
  3275. MULPD XMM1, XMM0
  3276. ADDPD XMM5, XMM1
  3277. MOVUPD XMM1, [RDI+64]
  3278. MULPD XMM1, XMM0
  3279. ADDPD XMM6, XMM1
  3280. MOVUPD XMM1, [RDI+80]
  3281. MULPD XMM1, XMM0
  3282. ADDPD XMM7, XMM1
  3283. SparseEntryEscape:
  3284. ADD RDI, [RBP+StrideB] ; StrideB
  3285. INC RAX
  3286. CMP RAX, [RBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3287. JL DotProductLoop ; end DotProductLoop
  3288. MOV RAX , [RBP+matrixC] ; matrixC
  3289. LEA RAX, [RAX+RBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3290. MOVUPD [RAX], XMM2
  3291. MOVUPD [RAX+16], XMM3
  3292. MOVUPD [RAX+32], XMM4
  3293. MOVUPD [RAX+48], XMM5
  3294. MOVUPD [RAX+64], XMM6
  3295. MOVUPD [RAX+80], XMM7
  3296. ADD RBX, 12 ; move over TO next batch OF 12
  3297. MOV RDX, RBX
  3298. ADD RDX, 12
  3299. CMP RDX, [RBP+Cb] ; Cb, check TO see IF row IS complete
  3300. JLE DotProductSetup ; end RowOfResultsLoop
  3301. MOV RAX , [RBP+matrixA] ; matrixA
  3302. ADD RAX, [RBP+StrideA] ; StrideA
  3303. MOV [RBP+matrixA], RAX ; matrixA
  3304. MOV RAX, [RBP+matrixC] ; matrixC
  3305. ADD RAX, [RBP+StrideC] ; StrideC
  3306. MOV [RBP+matrixC], RAX ; matrixC
  3307. INC RCX
  3308. CMP RCX, [RBP+Ra] ; Ra
  3309. JL RowOfResultsLoop
  3310. Done:
  3311. MOV RAX, [RBP+CbFirst] ; CbFirst
  3312. MOV [RAX], RBX ;
  3313. END SSEMul12BlockX;
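  (* SSEMul12BlockX is the LONGREAL counterpart of SSEMul24BlockR: batches of 12 columns held in six XMM
  accumulators of two LONGREALs each; the sparse-entry shortcut is commented out in this variant. *)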
  3314. PROCEDURE SSEMul16BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3315. add: BOOLEAN );
  3316. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3317. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3318. DotProductSetup:
  3319. MOV RSI, [RBP+matrixA] ; matrixA
  3320. MOV RDI, [RBP+matrixB] ; matrixB
  3321. MOV RDX, [RBP+CbFrom] ; CbFrom
  3322. LEA RDI, [RDI+RDX*4]
  3323. XORPS XMM2, XMM2
  3324. XORPS XMM3, XMM3
  3325. XORPS XMM4, XMM4
  3326. XORPS XMM5, XMM5
  3327. MOV RAX, 0 ;
  3328. MOV AL, [RBP+add] ;
  3329. CMP AL, 0 ; add?
  3330. JE DotProductLoop ;
  3331. MOV RAX, [RBP+matrixC] ; matrixC
  3332. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally
  3333. MOVUPS XMM2, [RAX]
  3334. MOVUPS XMM3, [RAX+16]
  3335. MOVUPS XMM4, [RAX+32]
  3336. MOVUPS XMM5, [RAX+48]
  3337. MOV RAX, 0
  3338. DotProductLoop:
  3339. MOV RDX, [RSI+RAX*4]
  3340. SHL RDX, 1
  3341. CMP RDX, 0
  3342. JE SparseEntryEscape
  3343. MOVSS XMM0, [RSI+RAX*4]
  3344. SHUFPS XMM0, XMM0, 0H
  3345. MOVUPS XMM1, [RDI]
  3346. MULPS XMM1, XMM0
  3347. ADDPS XMM2, XMM1
  3348. MOVUPS XMM1, [RDI+16]
  3349. MULPS XMM1, XMM0
  3350. ADDPS XMM3, XMM1
  3351. MOVUPS XMM1, [RDI+32]
  3352. MULPS XMM1, XMM0
  3353. ADDPS XMM4, XMM1
  3354. MOVUPS XMM1, [RDI+48]
  3355. MULPS XMM1, XMM0
  3356. ADDPS XMM5, XMM1
  3357. SparseEntryEscape:
  3358. ADD RDI, [RBP+StrideB] ; StrideB
  3359. INC RAX
  3360. CMP RAX, [RBP+Ca] ; Ca
  3361. JL DotProductLoop ; end DotProductLoop
  3362. MOV RAX , [RBP+matrixC] ; matrixC
  3363. MOV RDX, [RBP+CbFrom] ; CbFrom
  3364. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 16
  3365. MOVUPS [RAX], XMM2
  3366. MOVUPS [RAX+16], XMM3
  3367. MOVUPS [RAX+32], XMM4
  3368. MOVUPS [RAX+48], XMM5
  3369. MOV RAX, [RBP+matrixA] ; matrixA
  3370. ADD RAX, [RBP+StrideA] ; StrideA
  3371. MOV [RBP+matrixA], RAX ; matrixA
  3372. MOV RAX, [RBP+matrixC] ; matrixC
  3373. ADD RAX, [RBP+StrideC] ; StrideC
  3374. MOV [RBP+matrixC], RAX ; matrixC
  3375. INC RCX
  3376. CMP RCX, [RBP+Ra] ; Ra
  3377. JL DotProductSetup ;
  3378. END SSEMul16BlockR;
  3379. PROCEDURE SSEMul8BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3380. add: BOOLEAN );
  3381. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3382. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3383. DotProductSetup:
  3384. MOV RSI, [RBP+matrixA] ; matrixA
  3385. MOV RDI, [RBP+matrixB] ; matrixB
  3386. MOV RDX, [RBP+CbFrom] ; CbFrom
  3387. LEA RDI, [RDI+RDX*8]
  3388. XORPD XMM2, XMM2
  3389. XORPD XMM3, XMM3
  3390. XORPD XMM4, XMM4
  3391. XORPD XMM5, XMM5
  3392. MOV RAX, 0 ;
  3393. MOV AL, [RBP+add] ;
  3394. CMP AL, 0 ; add?
  3395. JE DotProductLoop ;
  3396. MOV RAX, [RBP+matrixC] ; matrixC
  3397. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3398. MOVUPD XMM2, [RAX]
  3399. MOVUPD XMM3, [RAX+16]
  3400. MOVUPD XMM4, [RAX+32]
  3401. MOVUPD XMM5, [RAX+48]
  3402. MOV RAX, 0
  3403. DotProductLoop:
  3404. ; MOV RDX, [RSI+RAX*8]
  3405. ; SHL RDX, 1
  3406. ; CMP RDX, 0
  3407. ; JE SparseEntryEscape
  3408. MOVSD XMM0, [RSI+RAX*8]
  3409. SHUFPD XMM0, XMM0, 0H
  3410. MOVUPD XMM1, [RDI]
  3411. MULPD XMM1, XMM0
  3412. ADDPD XMM2, XMM1
  3413. MOVUPD XMM1, [RDI+16]
  3414. MULPD XMM1, XMM0
  3415. ADDPD XMM3, XMM1
  3416. MOVUPD XMM1, [RDI+32]
  3417. MULPD XMM1, XMM0
  3418. ADDPD XMM4, XMM1
  3419. MOVUPD XMM1, [RDI+48]
  3420. MULPD XMM1, XMM0
  3421. ADDPD XMM5, XMM1
  3422. SparseEntryEscape:
  3423. ADD RDI, [RBP+StrideB] ; StrideB
  3424. INC RAX
  3425. CMP RAX, [RBP+Ca] ; Ca
  3426. JL DotProductLoop ; end DotProductLoop
  3427. MOV RAX , [RBP+matrixC] ; matrixC
  3428. MOV RDX, [RBP+CbFrom] ; CbFirst
  3429. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3430. MOVUPD [RAX], XMM2
  3431. MOVUPD [RAX+16], XMM3
  3432. MOVUPD [RAX+32], XMM4
  3433. MOVUPD [RAX+48], XMM5
  3434. MOV RAX, [RBP+matrixA] ; matrixA
  3435. ADD RAX, [RBP+StrideA] ; StrideA
  3436. MOV [RBP+matrixA], RAX ; matrixA
  3437. MOV RAX, [RBP+matrixC] ; matrixC
  3438. ADD RAX, [RBP+StrideC] ; StrideC
  3439. MOV [RBP+matrixC], RAX ; matrixC
  3440. INC RCX
  3441. CMP RCX, [RBP+Ra] ; Ra
  3442. JL DotProductSetup ;
  3443. END SSEMul8BlockX;
  3444. PROCEDURE SSEMul8BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3445. add: BOOLEAN );
  3446. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3447. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3448. DotProductSetup:
  3449. MOV RSI, [RBP+matrixA] ; matrixA
  3450. MOV RDI, [RBP+matrixB] ; matrixB
  3451. MOV RDX, [RBP+CbFrom] ; CbFrom
  3452. LEA RDI, [RDI+RDX*4]
  3453. XORPS XMM2, XMM2
  3454. XORPS XMM3, XMM3
  3455. MOV RAX, 0 ;
  3456. MOV AL, [RBP+add] ;
  3457. CMP AL, 0 ; add?
  3458. JE DotProductLoop ;
  3459. MOV RAX, [RBP+matrixC] ; matrixC
  3460. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3461. MOVUPS XMM2, [RAX]
  3462. MOVUPS XMM3, [RAX+16]
  3463. MOV RAX, 0
  3464. DotProductLoop:
  3465. MOV RDX, [RSI+RAX*4]
  3466. SHL RDX, 1
  3467. CMP RDX, 0
  3468. JE SparseEntryEscape
  3469. MOVSS XMM0, [RSI+RAX*4]
  3470. SHUFPS XMM0, XMM0, 0H
  3471. MOVUPS XMM1, [RDI]
  3472. MULPS XMM1, XMM0
  3473. ADDPS XMM2, XMM1
  3474. MOVUPS XMM1, [RDI+16]
  3475. MULPS XMM1, XMM0
  3476. ADDPS XMM3, XMM1
  3477. SparseEntryEscape:
  3478. ADD RDI, [RBP+StrideB] ; StrideB
  3479. INC RAX
  3480. CMP RAX, [RBP+Ca] ; Ca
  3481. JL DotProductLoop ; end DotProductLoop
  3482. MOV RAX , [RBP+matrixC] ; matrixC
  3483. MOV RDX, [RBP+CbFrom] ; CbFrom
  3484. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3485. MOVUPS [RAX], XMM2
  3486. MOVUPS [RAX+16], XMM3
  3487. MOV RAX, [RBP+matrixA] ; matrixA
  3488. ADD RAX, [RBP+StrideA] ; StrideA
  3489. MOV [RBP+matrixA], RAX ; matrixA
  3490. MOV RAX, [RBP+matrixC] ; matrixC
  3491. ADD RAX, [RBP+StrideC] ; StrideC
  3492. MOV [RBP+matrixC], RAX ; matrixC
  3493. INC RCX
  3494. CMP RCX, [RBP+Ra] ; Ra
  3495. JL DotProductSetup ;
  3496. END SSEMul8BlockR;
  3497. PROCEDURE SSEMul4BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3498. add: BOOLEAN );
  3499. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3500. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3501. DotProductSetup:
  3502. MOV RAX, 0 ; cols IN A
  3503. MOV RSI, [RBP+matrixA] ; matrixA
  3504. MOV RDI, [RBP+matrixB] ; matrixB
  3505. MOV RDX, [RBP+CbFrom] ; CbFrom
  3506. LEA RDI, [RDI+RDX*8]
  3507. XORPS XMM2, XMM2
  3508. XORPS XMM3, XMM3
  3509. MOV RAX, 0 ;
  3510. MOV AL, [RBP+add] ;
  3511. CMP AL, 0 ; add?
  3512. JE DotProductLoop ;
  3513. MOV RAX, [RBP+matrixC] ; matrixC
  3514. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3515. MOVUPD XMM2, [RAX]
  3516. MOVUPD XMM3, [RAX+16]
  3517. MOV RAX, 0
  3518. DotProductLoop:
  3519. ; MOV RDX, [RSI+RAX*8]
  3520. ; SHL RDX, 1
  3521. ; CMP RDX, 0
  3522. ; JE SparseEntryEscape
  3523. MOVSD XMM0, [RSI+RAX*8]
  3524. SHUFPD XMM0, XMM0, 0H
  3525. MOVUPD XMM1, [RDI]
  3526. MULPD XMM1, XMM0
  3527. ADDPD XMM2, XMM1
  3528. MOVUPD XMM1, [RDI+16]
  3529. MULPD XMM1, XMM0
  3530. ADDPD XMM3, XMM1
  3531. SparseEntryEscape:
  3532. ADD RDI, [RBP+StrideB] ; StrideB
  3533. INC RAX
  3534. CMP RAX, [RBP+Ca] ; Ca
  3535. JL DotProductLoop ; end DotProductLoop
  3536. MOV RAX , [RBP+matrixC] ; matrixC
  3537. MOV RDX, [RBP+CbFrom] ; CbFrom
  3538. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3539. MOVUPD [RAX], XMM2
  3540. MOVUPD [RAX+16], XMM3
  3541. MOV RAX, [RBP+matrixA] ; matrixA
  3542. ADD RAX, [RBP+StrideA] ; StrideA
  3543. MOV [RBP+matrixA], RAX ; matrixA
  3544. MOV RAX, [RBP+matrixC] ; matrixC
  3545. ADD RAX, [RBP+StrideC] ; StrideC
  3546. MOV [RBP+matrixC], RAX ; matrixC
  3547. INC RCX
  3548. CMP RCX, [RBP+Ra] ; Ra
  3549. JL DotProductSetup ;
  3550. END SSEMul4BlockX;
  3551. PROCEDURE SSEMul4BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3552. add: BOOLEAN );
  3553. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3554. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3555. DotProductSetup:
  3556. MOV RAX, 0 ; cols IN A
  3557. MOV RSI, [RBP+matrixA] ; matrixA
  3558. MOV RDI, [RBP+matrixB] ; matrixB
  3559. MOV RDX, [RBP+CbFrom] ; CbFrom
  3560. LEA RDI, [RDI+RDX*4]
  3561. XORPS XMM2, XMM2
  3562. MOV RAX, 0 ;
  3563. MOV AL, [RBP+add] ;
  3564. CMP AL, 0 ; add?
  3565. JE DotProductLoop ;
  3566. MOV RAX, [RBP+matrixC] ; matrixC
  3567. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3568. MOVUPS XMM2, [RAX]
  3569. MOV RAX, 0
  3570. DotProductLoop:
  3571. MOV RDX, [RSI+RAX*4]
  3572. SHL RDX, 1
  3573. CMP RDX, 0
  3574. JE SparseEntryEscape
  3575. MOVSS XMM0, [RSI+RAX*4]
  3576. SHUFPS XMM0, XMM0, 0H
  3577. MOVUPS XMM1, [RDI]
  3578. MULPS XMM1, XMM0
  3579. ADDPS XMM2, XMM1
  3580. SparseEntryEscape:
  3581. ADD RDI, [RBP+StrideB] ; StrideB
  3582. INC RAX
  3583. CMP RAX, [RBP+Ca] ; Ca
  3584. JL DotProductLoop ; end DotProductLoop
  3585. MOV RAX, [RBP+matrixC] ; matrixC
  3586. MOV RDX, [RBP+CbFrom] ; CbFrom
  3587. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3588. MOVUPS [RAX], XMM2
  3589. MOV RAX, [RBP+matrixA] ; matrixA
  3590. ADD RAX, [RBP+StrideA] ; StrideA
  3591. MOV [RBP+matrixA], RAX ; matrixA
  3592. MOV RAX, [RBP+matrixC] ; matrixC
  3593. ADD RAX, [RBP+StrideC] ; StrideC
  3594. MOV [RBP+matrixC], RAX ; matrixC
  3595. INC RCX
  3596. CMP RCX, [RBP+Ra] ; Ra
  3597. JL DotProductSetup ;
  3598. END SSEMul4BlockR;
  3599. PROCEDURE SSEMul2BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3600. add: BOOLEAN );
  3601. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3602. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3603. DotProductSetup:
  3604. MOV RAX, 0 ; cols IN A
  3605. MOV RSI, [RBP+matrixA] ; matrixA
  3606. MOV RDI, [RBP+matrixB] ; matrixB
  3607. MOV RDX, [RBP+CbFrom] ; CbFrom
  3608. LEA RDI, [RDI+RDX*8]
  3609. XORPD XMM2, XMM2
  3610. MOV RAX, 0 ;
  3611. MOV AL, [RBP+add] ;
  3612. CMP AL, 0 ; add?
  3613. JE DotProductLoop ;
  3614. MOV RAX, [RBP+matrixC] ; matrixC
  3615. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3616. MOVUPD XMM2, [RAX]
  3617. MOV RAX, 0
  3618. DotProductLoop:
  3619. ; MOV RDX, [RSI+RAX*4] ;
  3620. ; SHL RDX, 1 ;
  3621. ; CMP RDX, 0
  3622. ; JE SparseEntryEscape
  3623. MOVSD XMM0, [RSI+RAX*8]
  3624. SHUFPD XMM0, XMM0, 0H
  3625. MOVUPD XMM1, [RDI]
  3626. MULPD XMM1, XMM0
  3627. ADDPD XMM2, XMM1
  3628. SparseEntryEscape:
  3629. ADD RDI, [RBP+StrideB] ; StrideB
  3630. INC RAX
  3631. CMP RAX, [RBP+Ca] ; Ca
  3632. JL DotProductLoop ; end DotProductLoop
  3633. MOV RAX , [RBP+matrixC] ; matrixC
  3634. MOV RDX, [RBP+CbFrom] ; CbFrom
  3635. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3636. MOVUPD [RAX], XMM2
  3637. MOV RAX, [RBP+matrixA] ; matrixA
  3638. ADD RAX, [RBP+StrideA] ; StrideA
  3639. MOV [RBP+matrixA], RAX ; matrixA
  3640. MOV RAX, [RBP+matrixC] ; matrixC
  3641. ADD RAX, [RBP+StrideC] ; StrideC
  3642. MOV [RBP+matrixC], RAX ; matrixC
  3643. INC RCX
  3644. CMP RCX, [RBP+Ra] ; Ra
  3645. JL DotProductSetup ;
  3646. END SSEMul2BlockX;
  3647. (****** blocking matrix multiplication with copy of data ******)
  3648. PROCEDURE MagicBlockR( M, N, K: SIZE;
  3649. VAR L2BlockM, L2BlockN, L2BlockK: SIZE );
  3650. BEGIN
  3651. K := (K DIV L0BlockKR) * L0BlockKR;
  3652. N := (N DIV L1BlockN) * L1BlockN;
  3653. IF M = 0 THEN M := 1 END;
  3654. IF N = 0 THEN N := 1 END;
  3655. IF K = 0 THEN K := 1 END;
  3656. L2BlockK :=
  3657. K DIV ((K + L1MaxBlockKR - 1) DIV L1MaxBlockKR);
  3658. (* Round up to next multiple of 16 *)
  3659. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  3660. L2BlockN :=
  3661. L2BlockSize DIV SIZEOF( REAL ) DIV
  3662. (L2BlockK * (L2BARatio + 1));
  3663. IF L2BlockN > N THEN L2BlockN := N
  3664. ELSIF L2BlockN < 1 THEN L2BlockN := 1;
  3665. END;
  3666. L2BlockM :=
  3667. (L2BlockSize DIV SIZEOF( REAL ) - L2BlockN * L2BlockK) DIV
  3668. L2BlockK;
  3669. (* Clamp L2BlockM to the range [1, M] *)
  3670. IF L2BlockM > M THEN L2BlockM := M
  3671. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  3672. END;
  3673. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  3674. END MagicBlockR;
  3675. PROCEDURE MagicBlockX( M, N, K: SIZE;
  3676. VAR L2BlockM, L2BlockN, L2BlockK:SIZE );
  3677. BEGIN
  3678. K := (K DIV L0BlockKX) * L0BlockKX;
  3679. N := (N DIV L1BlockN) * L1BlockN;
  3680. IF M = 0 THEN M := 1 END;
  3681. IF N = 0 THEN N := 1 END;
  3682. IF K = 0 THEN K := 1 END;
  3683. L2BlockK :=
  3684. K DIV ((K + L1MaxBlockKX - 1) DIV L1MaxBlockKX);
  3685. (* Round up to next multiple of 16 *)
  3686. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  3687. L2BlockN :=
  3688. L2BlockSize DIV SIZEOF( LONGREAL ) DIV
  3689. (L2BlockK * (L2BARatio + 1));
  3690. IF L2BlockN > N THEN L2BlockN := N END;
  3691. L2BlockM :=
  3692. (L2BlockSize DIV SIZEOF( LONGREAL ) - L2BlockN * L2BlockK) DIV
  3693. L2BlockK;
  3694. (* Clamp L2BlockM to the range [1, M] *)
  3695. IF L2BlockM > M THEN L2BlockM := M
  3696. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  3697. END;
  3698. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  3699. END MagicBlockX;
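  (* MagicBlockR/MagicBlockX pick the L2 blocking parameters: K is cut into nearly equal chunks of at most
  L1MaxBlockKR/L1MaxBlockKX and rounded up to a multiple of 16; L2BlockN is sized so that the
  L2BlockK x L2BlockN panel of B takes roughly 1/(L2BARatio+1) of L2BlockSize; L2BlockM fills the rest of
  that budget with rows of A; finally L2BlockN is rounded up to a multiple of L1BlockN. *)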
  3700. (*
  3701. PROCEDURE L1Block1X( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3702. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  3703. PROCEDURE null( i: LONGINT );
  3704. BEGIN
  3705. reg[i, 0] := 0; reg[i, 1] := 0;
  3706. END null;
  3707. PROCEDURE get1( adr, i: LONGINT );
  3708. BEGIN
  3709. SYSTEM.GET( adr, reg[i, 0] );
  3710. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3711. END get1;
  3712. PROCEDURE get2( adr, i: LONGINT );
  3713. BEGIN
  3714. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  3715. IF debug THEN
  3716. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3717. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  3718. END;
  3719. END get2;
  3720. PROCEDURE mul2( i, j: LONGINT );
  3721. BEGIN
  3722. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3723. END mul2;
  3724. PROCEDURE add2( i, j: LONGINT );
  3725. BEGIN
  3726. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3727. END add2;
  3728. PROCEDURE put1( adr, i: LONGINT );
  3729. BEGIN
  3730. SYSTEM.PUT( adr, reg[i, 0] );
  3731. END put1;
  3732. PROCEDURE horadd( i: LONGINT );
  3733. BEGIN
  3734. reg[i, 0] := reg[i, 0] + reg[i, 1];
  3735. END horadd;
  3736. BEGIN
  3737. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  3738. null( 2 ); get1( adrC, 2 );
  3739. WHILE (K > 0) DO (* padding guaranteed *)
  3740. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 ); INC( adrB, 16 );
  3741. INC( adrA, 16 ); DEC( K, 2 );
  3742. END;
  3743. horadd( 2 ); put1( adrC, 2 );
  3744. END L1Block1X;
  3745. PROCEDURE L1Block5X( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3746. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  3747. PROCEDURE null( i: LONGINT );
  3748. BEGIN
  3749. reg[i, 0] := 0; reg[i, 1] := 0;
  3750. END null;
  3751. PROCEDURE get1( adr, i: LONGINT );
  3752. BEGIN
  3753. SYSTEM.GET( adr, reg[i, 0] );
  3754. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3755. END get1;
  3756. PROCEDURE get2( adr, i: LONGINT );
  3757. BEGIN
  3758. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  3759. IF debug THEN
  3760. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3761. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  3762. END;
  3763. END get2;
  3764. PROCEDURE mul2( i, j: LONGINT );
  3765. BEGIN
  3766. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3767. END mul2;
  3768. PROCEDURE add2( i, j: LONGINT );
  3769. BEGIN
  3770. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3771. END add2;
  3772. PROCEDURE put1( adr, i: LONGINT );
  3773. BEGIN
  3774. SYSTEM.PUT( adr, reg[i, 0] );
  3775. END put1;
  3776. PROCEDURE horadd( i: LONGINT );
  3777. BEGIN
  3778. reg[i, 0] := reg[i, 0] + reg[i, 1];
  3779. END horadd;
  3780. BEGIN
  3781. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  3782. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  3783. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  3784. get1( adrC + 4 * IncC, 6 );
  3785. WHILE (K > 0) DO (* padding guaranteed *)
  3786. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 );
  3787. get2( adrB + 16, 0 ); mul2( 0, 7 ); add2( 3, 0 ); get2( adrB + 32, 0 );
  3788. mul2( 0, 7 ); add2( 4, 0 ); get2( adrB + 48, 0 ); mul2( 0, 7 );
  3789. add2( 5, 0 ); get2( adrB + 64, 0 ); mul2( 0, 7 ); add2( 6, 0 ); INC( adrB, 80 );
  3790. INC( adrA, 16 ); DEC( K, 2 );
  3791. END;
  3792. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  3793. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  3794. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  3795. END L1Block5X;
  3796. PROCEDURE L1Block1R( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3797. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  3798. PROCEDURE null( i: LONGINT );
  3799. BEGIN
  3800. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  3801. END null;
  3802. PROCEDURE get1( adr, i: LONGINT );
  3803. BEGIN
  3804. SYSTEM.GET( adr, reg[i, 0] );
  3805. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3806. END get1;
  3807. PROCEDURE get4( adr, i: LONGINT );
  3808. BEGIN
  3809. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  3810. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  3811. IF debug THEN
  3812. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3813. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  3814. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  3815. END;
  3816. END get4;
  3817. PROCEDURE mul4( i, j: LONGINT );
  3818. BEGIN
  3819. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3820. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  3821. END mul4;
  3822. PROCEDURE add4( i, j: LONGINT );
  3823. BEGIN
  3824. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3825. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  3826. END add4;
  3827. PROCEDURE put1( adr, i: LONGINT );
  3828. BEGIN
  3829. SYSTEM.PUT( adr, reg[i, 0] );
  3830. END put1;
  3831. PROCEDURE horadd( i: LONGINT );
  3832. BEGIN
  3833. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  3834. END horadd;
  3835. BEGIN
  3836. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  3837. null( 2 ); get1( adrC, 2 );
  3838. WHILE (K > 0) DO (* padding guaranteed *)
  3839. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 ); INC( adrB, 16 );
  3840. INC( adrA, 16 ); DEC( K, 4 );
  3841. END;
  3842. horadd( 2 ); put1( adrC, 2 );
  3843. END L1Block1R;
  3844. PROCEDURE L1Block5R( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3845. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  3846. PROCEDURE null( i: LONGINT );
  3847. BEGIN
  3848. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  3849. END null;
  3850. PROCEDURE get1( adr, i: LONGINT );
  3851. BEGIN
  3852. SYSTEM.GET( adr, reg[i, 0] );
  3853. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3854. END get1;
  3855. PROCEDURE get4( adr, i: LONGINT );
  3856. BEGIN
  3857. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  3858. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  3859. IF debug THEN
  3860. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3861. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  3862. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  3863. END;
  3864. END get4;
  3865. PROCEDURE mul4( i, j: LONGINT );
  3866. BEGIN
  3867. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3868. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  3869. END mul4;
  3870. PROCEDURE add4( i, j: LONGINT );
  3871. BEGIN
  3872. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3873. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  3874. END add4;
  3875. PROCEDURE put1( adr, i: LONGINT );
  3876. BEGIN
  3877. SYSTEM.PUT( adr, reg[i, 0] );
  3878. END put1;
  3879. PROCEDURE horadd( i: LONGINT );
  3880. BEGIN
  3881. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  3882. END horadd;
  3883. BEGIN
  3884. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  3885. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  3886. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  3887. get1( adrC + 4 * IncC, 6 );
  3888. WHILE (K > 0) DO (* padding guaranteed *)
  3889. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 );
  3890. get4( adrB + 16, 0 ); mul4( 0, 7 ); add4( 3, 0 ); get4( adrB + 32, 0 );
  3891. mul4( 0, 7 ); add4( 4, 0 ); get4( adrB + 48, 0 ); mul4( 0, 7 );
  3892. add4( 5, 0 ); get4( adrB + 64, 0 ); mul4( 0, 7 ); add4( 6, 0 ); INC( adrB, 80 );
  3893. INC( adrA, 16 ); DEC( K, 4 );
  3894. END;
  3895. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  3896. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  3897. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  3898. END L1Block5R;
  3899. *)
  3900. PROCEDURE DispCR( adrM: ADDRESS;
  3901. inc, stride, M, N: SIZE );
  3902. VAR i, j: SIZE; adr: ADDRESS; val: REAL;
  3903. BEGIN
  3904. FOR i := 0 TO M - 1 DO
  3905. adr := adrM + i * stride;
  3906. FOR j := 0 TO N - 1 DO
  3907. SYSTEM.GET( adr, val );
  3908. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  3909. END;
  3910. KernelLog.Ln;
  3911. END;
  3912. END DispCR;
  3913. PROCEDURE DispCX( adrM: ADDRESS;
  3914. inc, stride, M, N: SIZE );
  3915. VAR i, j: SIZE; adr: ADDRESS; val: LONGREAL;
  3916. BEGIN
  3917. FOR i := 0 TO M - 1 DO
  3918. adr := adrM + i * stride;
  3919. FOR j := 0 TO N - 1 DO
  3920. SYSTEM.GET( adr, val );
  3921. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  3922. END;
  3923. KernelLog.Ln;
  3924. END;
  3925. END DispCX;
  3926. PROCEDURE L3BlockX( matrixA, matrixB, matrixC: ADDRESS;
  3927. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  3928. (*
  3929.          K                  N                        N
  3930.       *******          **********               **********
  3931.     M *******       K  **********       ->    M **********
  3932.       *******          **********               **********
  3933.
  3934.          A          *        B          ->          C
  3935. *)
  3936. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  3937. KAligned: SIZE;
  3938. CONST Size = SIZEOF( LONGREAL );
  3939. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  3940. (* M,N and K arbitrary ! *)
  3941. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  3942. m, k, KAligned: SIZE;
  3943. BEGIN
  3944. KAligned := Align2( K ) * 8;
  3945. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  3946. END;
  3947. adrB := matrixB;
  3948. WHILE (N >= L1BlockN) DO
  3949. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  3950. adrC := matrixC; adrA := matrixA; m := M;
  3951. WHILE (m > 0) DO
  3952. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  3953. IF SSE THEN
  3954. L1Block5XSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  3955. ELSE
  3956. aadrA := adrA; aadrB := adrB; k := K;
  3957. WHILE (k > 0) DO
  3958. L1Block1XA( aadrA, aadrB, adrC, 2 );
  3959. L1Block1XA( aadrA, aadrB + 16, adrC + incC, 2 );
  3960. L1Block1XA( aadrA, aadrB + 32, adrC + 2 * incC,
  3961. 2 );
  3962. L1Block1XA( aadrA, aadrB + 48, adrC + 3 * incC,
  3963. 2 );
  3964. L1Block1XA( aadrA, aadrB + 64, adrC + 4 * incC,
  3965. 2 );
  3966. DEC( k, 2 ); INC( aadrA, 16 );
  3967. INC( aadrB, 16 * L1BlockN );
  3968. END;
  3969. END;
  3970. IF debug THEN
  3971. DispCX( matrixC, incC, strideC, M, N );
  3972. END;
  3973. INC( adrA, KAligned ); INC( adrC, strideC );
  3974. DEC( m );
  3975. END;
  3976. INC( matrixC, L1BlockN * incC );
  3977. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  3978. END;
  3979. WHILE (N > 0) DO
  3980. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  3981. adrC := matrixC; adrA := matrixA; m := M;
  3982. WHILE (m > 0) DO
  3983. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  3984. IF SSE THEN
  3985. L1Block1XSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  3986. ELSE L1Block1XA( adrA, adrB, adrC, K );
  3987. END;
  3988. IF debug THEN
  3989. DispCX( matrixC, incC, strideC, M, N );
  3990. END;
  3991. INC( adrA, KAligned ); INC( adrC, strideC );
  3992. DEC( m );
  3993. END;
  3994. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  3995. END;
  3996. END L2Block;
  3997. BEGIN
  3998. KAligned := Align2( K ) * 8;
  3999. ASSERT( L2BlockK MOD 2 = 0 );
  4000. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4001. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4002. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4003. WHILE (n >= L2BlockN) DO
  4004. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4005. a1 := matrixA; adrC := matrixC; m := M;
  4006. WHILE (m >= L2BlockM) DO
  4007. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4008. adrA := a1; adrB := b1; k := K;
  4009. (* core: do matching level 2 cache Blocks *)
  4010. WHILE (k >= L2BlockK) DO
  4011. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4012. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4013. L2BlockK );
  4014. INC( adrA, L2BlockK * L2BlockM * Size );
  4015. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4016. DEC( k, L2BlockK );
  4017. END;
  4018. (* core: do rest of k *)
  4019. IF k > 0 THEN
  4020. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4021. END;
  4022. INC( a1, KAligned * L2BlockM );
  4023. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4024. END;
  4025. IF m > 0 THEN
  4026. (* clean up M *)
  4027. adrA := a1; adrB := b1; k := K;
  4028. WHILE (k >= L2BlockK) DO
  4029. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4030. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4031. INC( adrA, L2BlockK * Size * m );
  4032. INC( adrB, L2BlockK * L2BlockN * Size );
  4033. DEC( k, L2BlockK );
  4034. END;
  4035. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4036. IF k > 0 THEN
  4037. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4038. END;
  4039. END;
  4040. INC( b1, L2BlockN * KAligned );
  4041. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4042. END;
  4043. IF (n = 0) THEN RETURN
  4044. END;
  4045. a1 := matrixA; adrC := matrixC; m := M;
  4046. WHILE (m >= L2BlockM) DO
  4047. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4048. adrA := a1; adrB := b1; k := K;
  4049. WHILE (k >= L2BlockK) DO
  4050. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4051. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4052. INC( adrA, L2BlockM * L2BlockK * Size );
  4053. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4054. END;
  4055. IF k > 0 THEN
  4056. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4057. END;
  4058. INC( a1, L2BlockM * KAligned );
  4059. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4060. END;
  4061. IF (m = 0) THEN RETURN
  4062. END;
  4063. adrA := a1; adrB := b1; k := K;
  4064. WHILE (k >= L2BlockK) DO
  4065. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4066. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4067. INC( adrA, L2BlockK * m * Size );
  4068. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4069. END;
  4070. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4071. END;
  4072. END L3BlockX;
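  (* L3BlockX walks C in L2BlockM x L2BlockN tiles and, for each tile, accumulates over K in chunks of
  L2BlockK via the nested L2Block; L2Block in turn works in panels of L1BlockN (= 5) columns and calls the
  SSE level-1 kernels (L1Block5XSSE / L1Block1XSSE) or the pure Oberon fallbacks (L1Block1XA) once per row
  of A; A and B are expected in the packed layouts produced by CopyAX and CopyBX (cf. MultiplyX below).
  L3BlockR below is the REAL analogue. *)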
  4073. PROCEDURE L3BlockR( matrixA, matrixB, matrixC: ADDRESS;
  4074. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  4075. (*
  4076.          K                  N                        N
  4077.       *******          **********               **********
  4078.     M *******       K  **********       ->    M **********
  4079.       *******          **********               **********
  4080.
  4081.          A          *        B          ->          C
  4082. *)
  4083. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4084. KAligned: SIZE;
  4085. CONST Size = SIZEOF( REAL );
  4086. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4087. (* M,N and K arbitrary ! *)
  4088. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4089. m, KAligned, k: SIZE;
  4090. BEGIN
  4091. KAligned := Align4( K ) * 4;
  4092. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4093. END;
  4094. adrB := matrixB;
  4095. WHILE (N >= L1BlockN) DO
  4096. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4097. adrC := matrixC; adrA := matrixA; m := M;
  4098. WHILE (m > 0) DO
  4099. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4100. IF SSE THEN
  4101. L1Block5RSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4102. ELSE
  4103. aadrA := adrA; aadrB := adrB; k := K;
  4104. WHILE (k > 0) DO
  4105. L1Block1RA( aadrA, aadrB, adrC, 4 );
  4106. L1Block1RA( aadrA, aadrB + 16, adrC + incC, 4 );
  4107. L1Block1RA( aadrA, aadrB + 32, adrC + 2 * incC,
  4108. 4 );
  4109. L1Block1RA( aadrA, aadrB + 48, adrC + 3 * incC,
  4110. 4 );
  4111. L1Block1RA( aadrA, aadrB + 64, adrC + 4 * incC,
  4112. 4 );
  4113. DEC( k, 4 ); INC( aadrA, 16 );
  4114. INC( aadrB, 16 * L1BlockN );
  4115. END;
  4116. END;
  4117. IF debug THEN
  4118. DispCR( matrixC, incC, strideC, M, N );
  4119. END;
  4120. INC( adrA, KAligned ); INC( adrC, strideC );
  4121. DEC( m );
  4122. END;
  4123. INC( matrixC, L1BlockN * incC );
  4124. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4125. END;
  4126. WHILE (N > 0) DO
  4127. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4128. adrC := matrixC; adrA := matrixA; m := M;
  4129. WHILE (m > 0) DO
  4130. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4131. IF SSE THEN
  4132. L1Block1RSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4133. ELSE L1Block1RA( adrA, adrB, adrC, K );
  4134. END;
  4135. IF debug THEN
  4136. DispCR( matrixC, incC, strideC, M, N );
  4137. END;
  4138. INC( adrA, KAligned ); INC( adrC, strideC );
  4139. DEC( m );
  4140. END;
  4141. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4142. END;
  4143. END L2Block;
  4144. BEGIN
  4145. KAligned := Align4( K ) * 4;
  4146. ASSERT( L2BlockK MOD 4 = 0 );
  4147. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4148. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4149. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4150. WHILE (n >= L2BlockN) DO
  4151. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4152. a1 := matrixA; adrC := matrixC; m := M;
  4153. WHILE (m >= L2BlockM) DO
  4154. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4155. adrA := a1; adrB := b1; k := K;
  4156. (* core: do matching level 2 cache Blocks *)
  4157. WHILE (k >= L2BlockK) DO
  4158. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4159. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4160. L2BlockK );
  4161. INC( adrA, L2BlockK * L2BlockM * Size );
  4162. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4163. DEC( k, L2BlockK );
  4164. END;
  4165. (* core: do rest of k *)
  4166. IF k > 0 THEN
  4167. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4168. END;
  4169. INC( a1, KAligned * L2BlockM );
  4170. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4171. END;
  4172. IF m > 0 THEN
  4173. (* clean up M *)
  4174. adrA := a1; adrB := b1; k := K;
  4175. WHILE (k >= L2BlockK) DO
  4176. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4177. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4178. INC( adrA, L2BlockK * Size * m );
  4179. INC( adrB, L2BlockK * L2BlockN * Size );
  4180. DEC( k, L2BlockK );
  4181. END;
  4182. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4183. IF k > 0 THEN
  4184. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4185. END;
  4186. END;
  4187. INC( b1, L2BlockN * KAligned );
  4188. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4189. END;
  4190. IF (n = 0) THEN RETURN
  4191. END;
  4192. a1 := matrixA; adrC := matrixC; m := M;
  4193. WHILE (m >= L2BlockM) DO
  4194. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4195. adrA := a1; adrB := b1; k := K;
  4196. WHILE (k >= L2BlockK) DO
  4197. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4198. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4199. INC( adrA, L2BlockM * L2BlockK * Size );
  4200. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4201. END;
  4202. IF k > 0 THEN
  4203. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4204. END;
  4205. INC( a1, L2BlockM * KAligned );
  4206. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4207. END;
  4208. IF (m = 0) THEN RETURN
  4209. END;
  4210. adrA := a1; adrB := b1; k := K;
  4211. WHILE (k >= L2BlockK) DO
  4212. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4213. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4214. INC( adrA, L2BlockK * m * Size );
  4215. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4216. END;
  4217. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4218. END;
  4219. END L3BlockR;
  4220. PROCEDURE Align( adr: ADDRESS; align: SIZE ): ADDRESS;
  4221. BEGIN
  4222. RETURN adr + (-adr) MOD align; (* round adr up to the next multiple of align, e.g. align = 16 bytes = 128 bit *)
  4223. END Align;
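  (* e.g. Align( 100, 16 ) = 112 and Align( 112, 16 ) = 112: (-adr) MOD align is the distance to the next
  multiple of align, and 0 if adr is already aligned. *)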
  4224. PROCEDURE CopyAX( matrixA, dest: ADDRESS;
  4225. IncA, StrideA: SIZE;
  4226. K, M, L2BlockK, L2BlockM: SIZE );
  4227. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4228. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4229. VAR rest: SIZE;
  4230. BEGIN
  4231. IF debug THEN
  4232. KernelLog.String( "CopyMK:" ); KernelLog.Int( M, 10 ); KernelLog.Int( K, 10 );
  4233. KernelLog.Ln;
  4234. END;
  4235. rest := (-K) MOD 2;
  4236. WHILE (M > 0) DO
  4237. MovX( matrixA, dest, IncA, K ); INC( dest, K * 8 );
  4238. IF rest # 0 THEN
  4239. ZeroX( dest, rest ); INC( dest, 8 * rest );
  4240. END;
  4241. INC( matrixA, StrideA ); DEC( M );
  4242. END;
  4243. END CopyMK;
  4244. BEGIN
  4245. Tic( t ); m := M;
  4246. WHILE (m >= L2BlockM) DO
  4247. k := K; adrA := matrixA;
  4248. WHILE (k >= L2BlockK) DO
  4249. CopyMK( adrA, L2BlockM, L2BlockK );
  4250. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4251. END;
  4252. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4253. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4254. END;
  4255. adrA := matrixA; k := K;
  4256. WHILE (k >= L2BlockK) DO
  4257. CopyMK( adrA, m, L2BlockK );
  4258. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4259. END;
  4260. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4261. Toc( t, copyT );
  4262. END CopyAX;
  4263. PROCEDURE CopyAR( matrixA, dest: ADDRESS;
  4264. IncA, StrideA: SIZE;
  4265. K, M, L2BlockK, L2BlockM: SIZE );
  4266. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4267. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4268. VAR rest: SIZE;
  4269. BEGIN
  4270. rest := (-K) MOD 4;
  4271. WHILE (M > 0) DO
  4272. MovR( matrixA, dest, IncA, K ); INC( dest, K * 4 );
  4273. IF rest # 0 THEN
  4274. ZeroR( dest, rest ); INC( dest, 4 * rest );
  4275. END;
  4276. INC( matrixA, StrideA ); DEC( M );
  4277. END;
  4278. END CopyMK;
  4279. BEGIN
  4280. Tic( t ); m := M;
  4281. WHILE (m >= L2BlockM) DO
  4282. k := K; adrA := matrixA;
  4283. WHILE (k >= L2BlockK) DO
  4284. CopyMK( adrA, L2BlockM, L2BlockK );
  4285. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4286. END;
  4287. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4288. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4289. END;
  4290. adrA := matrixA; k := K;
  4291. WHILE (k >= L2BlockK) DO
  4292. CopyMK( adrA, m, L2BlockK );
  4293. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4294. END;
  4295. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4296. Toc( t, copyT );
  4297. END CopyAR;
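  (* CopyAX/CopyAR pack the M x K panel of A row by row into a contiguous buffer, zero-padding each row of K
  values up to a multiple of 2 (LONGREAL) resp. 4 (REAL) so the SSE kernels can always read full 128 bit
  groups; the copy proceeds L2 block by L2 block to match the traversal order of L3BlockX/L3BlockR. *)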
  4298. PROCEDURE CopyBX( matrixB, dest: ADDRESS;
  4299. IncB, StrideB: SIZE;
  4300. N, K, L2BlockN, L2BlockK: SIZE );
  4301. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4302. PROCEDURE Copy5x2k( matrixB: ADDRESS; k: SIZE );
  4303. VAR i: SIZE; adrB: ADDRESS; rest: SIZE;
  4304. BEGIN
  4305. rest := (-k) MOD 2;
  4306. WHILE (k >= 2) DO (* store 5x4 Block in line *)
  4307. adrB := matrixB;
  4308. FOR i := 1 TO L1BlockN DO
  4309. MovX( adrB, dest, StrideB, 2 ); INC( dest, 16 );
  4310. INC( adrB, IncB );
  4311. END;
  4312. INC( matrixB, 2 * StrideB ); DEC( k, 2 );
  4313. END;
  4314. IF k > 0 THEN
  4315. adrB := matrixB;
  4316. FOR i := 1 TO L1BlockN DO
  4317. MovX( adrB, dest, StrideB, k ); INC( dest, 8 * k );
  4318. IF rest # 0 THEN
  4319. ZeroX( dest, rest ); INC( dest, rest * 8 );
  4320. END;
  4321. INC( adrB, IncB );
  4322. END;
  4323. END;
  4324. END Copy5x2k;
  4325. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4326. VAR n, rest: SIZE;
  4327. BEGIN
  4328. rest := (-K) MOD 2;
  4329. IF debug THEN
  4330. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4331. END;
  4332. n := N;
  4333. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4334. Copy5x2k( matrixB, K );
  4335. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4336. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4337. END;
  4338. IF debug THEN KernelLog.String( "Copy1, n=" ); KernelLog.Int( n, 10 ); KernelLog.Ln;
  4339. END;
  4340. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4341. MovX( matrixB, dest, StrideB, K ); INC( dest, K * 8 );
  4342. ZeroR( dest, rest ); INC( dest, rest * 8 );
  4343. INC( matrixB, IncB ); DEC( n );
  4344. END;
  4345. END Copy1;
  4346. BEGIN
  4347. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4348. ASSERT( L2BlockK MOD 2 = 0 ); n := N;
  4349. WHILE (n >= L2BlockN) DO
  4350. k := K; adrB := matrixB;
  4351. WHILE (k >= L2BlockK) DO
  4352. Copy1( adrB, L2BlockK, L2BlockN );
  4353. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4354. END;
  4355. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4356. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4357. END;
  4358. IF (n = 0) THEN RETURN
  4359. END;
  4360. k := K; adrB := matrixB;
  4361. WHILE (k >= L2BlockK) DO
  4362. Copy1( adrB, L2BlockK, n );
  4363. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4364. END;
  4365. Copy1( adrB, k, n ); Toc( t, copyT );
  4366. END CopyBX;
  4367. PROCEDURE CopyBR( matrixB, dest: ADDRESS;
  4368. IncB, StrideB: SIZE;
  4369. N, K, L2BlockN, L2BlockK: SIZE );
  4370. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4371. PROCEDURE Copy5x4k( matrixB: ADDRESS; k: SIZE );
  4372. VAR i: SIZE; adrB: ADDRESS; rest, k4: SIZE;
  4373. BEGIN
  4374. k4 := k - k MOD 4; rest := (-k) MOD 4;
  4375. IF k4 > 0 THEN
  4376. MovR5( matrixB, IncB, StrideB, dest, k4 );
  4377. INC( matrixB, k4 * StrideB ); INC( dest, k4 * 80 DIV 4 );
  4378. DEC( k, k4 );
  4379. END;
  4380. (*
  4381. WHILE (k >= 4) DO (* store 5x4 Block in line *)
  4382. adrB := matrixB;
  4383. FOR i := 1 TO L1BlockN DO
  4384. MovR( adrB, dest, StrideB, 4 ); INC( dest, 16 ); INC( adrB, IncB );
  4385. END;
  4386. INC( matrixB, 4 * StrideB ); DEC( k, 4 );
  4387. END;
  4388. *)
  4389. IF k > 0 THEN
  4390. adrB := matrixB;
  4391. FOR i := 1 TO L1BlockN DO
  4392. MovR( adrB, dest, StrideB, k ); INC( dest, 4 * k );
  4393. IF rest # 0 THEN
  4394. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4395. END;
  4396. INC( adrB, IncB );
  4397. END;
  4398. END;
  4399. END Copy5x4k;
  4400. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4401. VAR n, rest: SIZE;
  4402. BEGIN
  4403. rest := (-K) MOD 4;
  4404. IF debug THEN
  4405. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4406. END;
  4407. n := N;
  4408. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4409. Copy5x4k( matrixB, K );
  4410. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4411. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4412. END;
  4413. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4414. MovR( matrixB, dest, StrideB, K ); INC( dest, K * 4 );
  4415. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4416. INC( matrixB, IncB ); DEC( n );
  4417. END;
  4418. END Copy1;
  4419. BEGIN
  4420. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4421. ASSERT( L2BlockK MOD 4 = 0 ); n := N;
  4422. WHILE (n >= L2BlockN) DO
  4423. k := K; adrB := matrixB;
  4424. WHILE (k >= L2BlockK) DO
  4425. Copy1( adrB, L2BlockK, L2BlockN );
  4426. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4427. END;
  4428. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4429. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4430. END;
  4431. IF (n = 0) THEN RETURN
  4432. END;
  4433. k := K; adrB := matrixB;
  4434. WHILE (k >= L2BlockK) DO
  4435. Copy1( adrB, L2BlockK, n );
  4436. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4437. END;
  4438. Copy1( adrB, k, n ); Toc( t, copyT );
  4439. END CopyBR;
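(*
Example of the padding used by Copy1/Copy5x4k above (numbers are illustrative only): K is padded
with rest := (-K) MOD 4 zeros so that every packed column occupies a multiple of 16 bytes.
For K = 7, rest = 1, so each of the five columns of an L1 panel contributes 7 REALs (28 bytes)
plus 1 zero (4 bytes) = 32 bytes; a whole panel advances dest by 5 * 32 = 160 bytes, which keeps
the ASSERT( dest MOD 16 = 0 ) in Copy1 satisfied. CopyBX is the LONGREAL analogue and pads K to a
multiple of 2 for the same 16-byte alignment.
*)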
  4440. (*
  4441. PROCEDURE FillMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4442. VAR i, j: LONGINT;
  4443. BEGIN
  4444. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4445. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4446. A[i, j] := ran.Dice( 10 );
  4447. IF debug THEN A[i, j] := 10 * i + j; END;
  4448. END;
  4449. END;
  4450. END FillMR;
  4451. PROCEDURE DispMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4452. VAR i, j: LONGINT;
  4453. BEGIN
  4454. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4455. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4456. KernelLog.Ln;
  4457. END;
  4458. END DispMR;
  4459. PROCEDURE FillMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4460. VAR i, j: LONGINT;
  4461. BEGIN
  4462. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4463. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4464. A[i, j] := ran.Dice( 10 );
  4465. IF debug THEN A[i, j] := 10 * i + j; END;
  4466. END;
  4467. END;
  4468. END FillMX;
  4469. PROCEDURE DispMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4470. VAR i, j: LONGINT;
  4471. BEGIN
  4472. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4473. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4474. KernelLog.Ln;
  4475. END;
  4476. END DispMX;
  4477. *)
  4478. PROCEDURE -GetTimer( ): HUGEINT;
  4479. CODE {SYSTEM.AMD64}
  4480. CPUID ;
  4481. XOR RAX, RAX
  4482. RDTSC
  4483. SHL RDX, 32
  4484. OR RAX, RDX
  4485. END GetTimer;
  4486. PROCEDURE Tic( VAR t: HUGEINT );
  4487. BEGIN
  4488. t := GetTimer();
  4489. END Tic;
  4490. PROCEDURE Toc( VAR t, addto: HUGEINT );
  4491. BEGIN
  4492. INC( addto, GetTimer() - t ); t := GetTimer();
  4493. END Toc;
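(*
Usage sketch for the timing helpers (hypothetical caller, for illustration only):

	VAR t: HUGEINT;
	...
	Tic( t );           (* t := current time stamp counter *)
	(* work to be measured *)
	Toc( t, compT );    (* adds the elapsed ticks to compT and restarts t *)

GetTimer reads the CPU time stamp counter (RDTSC), so the values accumulated in
allocT, copyT, zeroT and compT are counter ticks, not seconds.
*)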
  4494. PROCEDURE MultiplyX( A, B, C: ADDRESS;
  4495. M, N, K, L2BlockM, L2BlockN, L2BlockK:SIZE;
  4496. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4497. add: BOOLEAN );
  4498. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4499. M1, M2, i: SIZE; val: LONGREAL; t: HUGEINT;
  4500. inc: SIZE;
  4501. obj: POINTER TO ARRAY OF MultiplyObjectX;
  4502. cache: Cache;
  4503. BEGIN
  4504. NEW(obj,nrProcesses+1);
  4505. lenA := M * Align2( K ) * 8; lenB := N * Align2( K ) * 8;
  4506. cache := cachePool.Acquire( lenA + lenB );
  4507. adrA := cache.adr; adrB := adrA + lenA;
  4508. CopyAX( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4509. CopyBX( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4510. Tic( t ); m := M; adrC := C;
  4511. IF ~add THEN
  4512. WHILE (m > 0) DO
  4513. ZeroXI( adrC, IncC, N ); INC( adrC, StrideC ); DEC( m );
  4514. END;
  4515. END;
  4516. Toc( t, zeroT );
  4517. IF debug THEN
  4518. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4519. FOR i := 0 TO M * Align2( K ) - 1 DO
  4520. SYSTEM.GET( adrA + i * 8, val );
  4521. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4522. END;
  4523. END;
  4524. IF debug THEN
  4525. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4526. FOR i := 0 TO N * Align2( K ) - 1 DO
  4527. SYSTEM.GET( adrB + i * 8, val );
  4528. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4529. END;
  4530. END;
  4531. IF parallel & (M > L2BlockM) THEN
  4532. inc := Align( M DIV nrProcesses, L2BlockM ); M1 := 0;
  4533. i := 0;
  4534. WHILE (M1 < M) DO
  4535. M2 := M1 + inc;
  4536. IF M2 > M THEN M2 := M END;
  4537. NEW( obj[i], adrA + M1 * Align2( K ) * 8, adrB,
  4538. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4539. L2BlockM, L2BlockN, L2BlockK );
  4540. M1 := M2; INC( i );
  4541. END;
  4542. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4543. ELSE
  4544. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4545. L2BlockN, L2BlockK );
  4546. END;
  4547. Toc( t, compT ); cachePool.Release( cache );
  4548. END MultiplyX;
  4549. PROCEDURE MultiplyR( A, B, C: ADDRESS;
  4550. M, N, K, L2BlockM, L2BlockN, L2BlockK: SIZE;
  4551. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4552. add: BOOLEAN );
  4553. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4554. M1, M2, i: SIZE; val: REAL; inc: SIZE;
  4555. obj: POINTER TO ARRAY OF MultiplyObjectR;
  4556. t: HUGEINT; cache: Cache;
  4557. BEGIN
  4558. NEW(obj,nrProcesses+1);
  4559. lenA := M * Align4( K ) * 4; lenB := N * Align4( K ) * 4;
  4560. cache := cachePool.Acquire( lenA + lenB );
  4561. adrA := cache.adr; adrB := adrA + lenA;
  4562. CopyAR( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4563. CopyBR( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4564. Tic( t ); m := M; adrC := C;
  4565. IF ~add THEN
  4566. WHILE (m > 0) DO
  4567. ZeroRI( adrC, IncC, N ); INC( adrC, StrideC );
  4568. DEC( m );
  4569. END;
  4570. END;
  4571. Toc( t, zeroT );
  4572. IF debug THEN
  4573. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4574. FOR i := 0 TO M * Align4( K ) - 1 DO
  4575. SYSTEM.GET( adrA + i * 4, val );
  4576. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4577. END;
  4578. END;
  4579. IF debug THEN
  4580. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4581. FOR i := 0 TO N * Align4( K ) - 1 DO
  4582. SYSTEM.GET( adrB + i * 4, val );
  4583. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4584. END;
  4585. END;
  4586. IF parallel & (M > L2BlockM) THEN
  4587. inc := Align( M DIV nrProcesses, L2BlockM ); M1 := 0;
  4588. i := 0;
  4589. WHILE (M1 < M) DO
  4590. M2 := M1 + inc;
  4591. IF M2 > M THEN M2 := M END;
  4592. NEW( obj[i], adrA + M1 * Align4( K ) * 4, adrB,
  4593. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4594. L2BlockM, L2BlockN, L2BlockK );
  4595. M1 := M2; INC( i );
  4596. END;
  4597. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4598. ELSE
  4599. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4600. L2BlockN, L2BlockK );
  4601. END;
  4602. Toc( t, compT ); cachePool.Release( cache );
  4603. END MultiplyR;
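(*
Worked example for the workspace sizing and the parallel split in MultiplyR/MultiplyX
(numbers illustrative; Align and Align4 are assumed to round up to the next multiple):
for M = 100, N = 80, K = 10, Align4( K ) = 12, so lenA = 100 * 12 * 4 = 4800 bytes and
lenB = 80 * 12 * 4 = 3840 bytes are taken from one cachePool buffer. With parallel = TRUE,
nrProcesses = 4 and L2BlockM = 16, inc = Align( 100 DIV 4, 16 ) = 32, so the rows are split
into chunks of 32, 32, 32 and 4 that are handled by separate MultiplyObjectR instances.
*)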
  4604. (*
  4605. PROCEDURE DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4606. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4607. A, B, C, D: ARRAY [ .. , .. ] OF LONGREAL;
  4608. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4609. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: LONGREAL; atime, time: LONGINT;
  4610. BEGIN
  4611. KernelLog.String( "LONGREAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4612. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4613. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4614. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4615. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMX( A ); FillMX( B );
  4616. IF debug THEN DispMX( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMX( B );
  4617. END;
  4618. atime := Input.Time(); (* C := 0; *)
  4619. WHILE (iter > 0) DO
  4620. MultiplyX( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4621. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4622. (*
  4623. 8,
  4624. LEN( A, 1 ) * 8, 8, LEN( B, 1 ) * 8, 8, LEN( C, 1 ) * 8
  4625. *)
  4626. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4627. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4628. );
  4629. DEC( iter );
  4630. END;
  4631. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4632. IF debug THEN
  4633. DispMX( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMX( B ); KernelLog.String( " = " );
  4634. KernelLog.Ln; DispMX( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  4635. END;
  4636. IF check THEN
  4637. (*
  4638. NEW(D,M,N);
  4639. MatMulAXAXNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4640. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4641. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  4642. *)
  4643. D := A * B;
  4644. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  4645. END;
  4646. END DoTestX;
  4647. PROCEDURE DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4648. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4649. A, B, C, D: ARRAY [ .. , .. ] OF REAL;
  4650. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4651. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: REAL; atime, time: LONGINT;
  4652. BEGIN
  4653. KernelLog.String( "REAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4654. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4655. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4656. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4657. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMR( A ); FillMR( B );
  4658. IF debug THEN DispMR( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMR( B );
  4659. END;
  4660. atime := Input.Time(); (* C := 0; *)
  4661. FOR i := 1 TO iter DO
  4662. MultiplyR( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4663. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4664. (* 4,
  4665. LEN( A, 1 ) * 4, 4, LEN( B, 1 ) * 4, 4, LEN( C, 1 ) * 4 *)
  4666. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4667. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4668. );
  4669. END;
  4670. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4671. IF debug THEN
  4672. DispMR( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMR( B ); KernelLog.String( " = " );
  4673. KernelLog.Ln; DispMR( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  4674. END;
  4675. IF check THEN
  4676. (*
  4677. NEW(D,M,N);
  4678. MatMulARARNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4679. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4680. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  4681. *)
  4682. D := A * B;
  4683. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  4684. END;
  4685. END DoTestR;
  4686. PROCEDURE RandTestR*;
  4687. VAR iter, i, time: LONGINT;
  4688. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  4689. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  4690. BEGIN
  4691. IF Min = Max THEN RETURN Min
  4692. ELSE RETURN ran.Dice( Max - Min ) + Min
  4693. END;
  4694. END Ran;
  4695. BEGIN
  4696. In.Open(); In.LongInt( iter );
  4697. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  4698. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  4699. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  4700. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  4701. K := Ran( MinK, MaxK );
  4702. IF N < 5 THEN N := 5 END;
  4703. IF K < 4 THEN K := 4 END;
  4704. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  4705. BN := Align( BN, 5 );
  4706. IF BN > N THEN DEC( BN, 5 ) END;
  4707. BK := Align( BK, 4 );
  4708. IF BK > K THEN DEC( BK, 4 ) END;
  4709. DoTestR( M, N, K, BM, BN, BK, TRUE , 1 );
  4710. END;
  4711. END RandTestR;
  4712. PROCEDURE RandTestX*;
  4713. VAR iter, i, time: LONGINT;
  4714. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  4715. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  4716. BEGIN
  4717. IF Min = Max THEN RETURN Min
  4718. ELSE RETURN ran.Dice( Max - Min ) + Min
  4719. END;
  4720. END Ran;
  4721. BEGIN
  4722. In.Open(); In.LongInt( iter );
  4723. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  4724. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  4725. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  4726. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  4727. K := Ran( MinK, MaxK );
  4728. IF N < 5 THEN N := 5 END;
  4729. IF K < 4 THEN K := 4 END;
  4730. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  4731. BN := Align( BN, 5 );
  4732. IF BN > N THEN DEC( BN, 5 ) END;
  4733. BK := Align( BK, 4 );
  4734. IF BK > K THEN DEC( BK, 4 ) END;
  4735. DoTestX( M, N, K, BM, BN, BK, TRUE , 1 );
  4736. END;
  4737. END RandTestX;
  4738. *)
  4739. (*
  4740. PROCEDURE Times*;
  4741. VAR all: HUGEINT;
  4742. BEGIN
  4743. all := allocT + copyT + zeroT + compT; KernelLog.String( "alloc=" );
  4744. KernelLog.LongRealFix( allocT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4745. KernelLog.Int( ENTIER( 100 * allocT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4746. KernelLog.Ln; KernelLog.String( "copy=" );
  4747. KernelLog.LongRealFix( copyT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4748. KernelLog.Int( ENTIER( 100 * copyT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4749. KernelLog.Ln; KernelLog.String( "zero=" );
  4750. KernelLog.LongRealFix( zeroT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4751. KernelLog.Int( ENTIER( 100 * zeroT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4752. KernelLog.Ln; KernelLog.String( "comp=" );
  4753. KernelLog.LongRealFix( compT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4754. KernelLog.Int( ENTIER( 100 * compT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4755. KernelLog.Ln;
  4756. END Times;
  4757. *)
  4758. (*
  4759. PROCEDURE TestRMM*;
  4760. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  4761. check, iter: LONGINT;
  4762. BEGIN
  4763. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  4764. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  4765. In.LongInt( iter ); In.LongInt( check );
  4766. IF L2BlockM = 0 THEN
  4767. MagicBlockR( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  4768. END;
  4769. DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  4770. END TestRMM;
  4771. PROCEDURE TestXMM*;
  4772. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  4773. iter, check: LONGINT;
  4774. BEGIN
  4775. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  4776. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  4777. In.LongInt( iter ); In.LongInt( check );
  4778. IF L2BlockM = 0 THEN
  4779. MagicBlockX( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  4780. END;
  4781. DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  4782. END TestXMM;
  4783. *)
  4784. (****** matrix multiplication using fast scalar product ******)
  4785. PROCEDURE MatMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4786. BEGIN
  4787. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  4788. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4789. END MatMulAXAXLoopA;
  4790. PROCEDURE MatMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4791. BEGIN
  4792. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  4793. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4794. END MatMulAXAXLoopSSE;
  4795. PROCEDURE MatMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4796. BEGIN
  4797. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  4798. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4799. END MatMulARARLoopA;
  4800. PROCEDURE MatMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4801. BEGIN
  4802. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  4803. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4804. END MatMulARARLoopSSE;
  4805. PROCEDURE MatMulIncAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4806. BEGIN
  4807. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4808. END MatMulIncAXAXLoopA;
  4809. PROCEDURE MatMulIncAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4810. BEGIN
  4811. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4812. END MatMulIncAXAXLoopSSE;
  4813. PROCEDURE MatMulIncARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4814. BEGIN
  4815. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4816. END MatMulIncARARLoopA;
  4817. PROCEDURE MatMulIncARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4818. BEGIN
  4819. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4820. END MatMulIncARARLoopSSE;
4821. (****** matrix multiplication over rows with transposition of B ******)
  4822. PROCEDURE MatMulHBlockR( MatrixA, MatrixB, MatrixC: ADDRESS;
  4823. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  4824. add: BOOLEAN );
  4825. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  4826. (*
  4827. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  4828. *)
  4829. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  4830. VAR i, j: SIZE; adrA, adrB, adrC: ADDRESS;
  4831. BEGIN
  4832. FOR i := fromA TO toA - 1 DO
  4833. adrA := MatrixA + i * Stride;
  4834. FOR j := fromB TO toB - 1 DO
  4835. adrB := MatrixB + j * Stride;
  4836. adrC := MatrixC + i * StrideC + j * IncC;
  4837. AlignedSPRSSE( adrA, adrB, adrC, Cols, add );
  4838. END;
  4839. END;
  4840. END Block;
  4841. BEGIN
  4842. IF cBlockSize = 0 THEN
  4843. BlockSize := L2CacheSize DIV Stride DIV 4;
  4844. ELSE BlockSize := cBlockSize;
  4845. END;
  4846. lastUsedBlockSize := BlockSize;
  4847. fromA := 0;
  4848. REPEAT
  4849. toA := fromA + BlockSize;
  4850. IF toA > RowsA THEN toA := RowsA END;
  4851. fromB := 0;
  4852. REPEAT
  4853. toB := fromB + BlockSize;
  4854. IF toB > RowsB THEN toB := RowsB END;
  4855. Block( fromA, toA, fromB, toB ); fromB := toB;
  4856. UNTIL toB = RowsB;
  4857. fromA := toA;
  4858. UNTIL toA = RowsA;
  4859. END MatMulHBlockR;
  4860. PROCEDURE MatMulHBlockX( MatrixA, MatrixB, MatrixC: ADDRESS;
  4861. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  4862. add: BOOLEAN );
  4863. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  4864. (*
  4865. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  4866. *)
  4867. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  4868. VAR adrA, adrB, adrC: ADDRESS; i, j: SIZE;
  4869. BEGIN
  4870. FOR i := fromA TO toA - 1 DO
  4871. adrA := MatrixA + i * Stride;
  4872. FOR j := fromB TO toB - 1 DO
  4873. adrB := MatrixB + j * Stride;
  4874. adrC := MatrixC + i * StrideC + j * IncC;
  4875. AlignedSPXSSE( adrA, adrB, adrC, Cols, add );
  4876. END;
  4877. END;
  4878. END Block;
  4879. BEGIN
  4880. IF cBlockSize = 0 THEN
  4881. BlockSize := L2CacheSize DIV Stride DIV 8;
  4882. ELSE BlockSize := cBlockSize;
  4883. END;
  4884. lastUsedBlockSize := BlockSize;
  4885. fromA := 0;
  4886. REPEAT
  4887. toA := fromA + BlockSize;
  4888. IF toA > RowsA THEN toA := RowsA END;
  4889. fromB := 0;
  4890. REPEAT
  4891. toB := fromB + BlockSize;
  4892. IF toB > RowsB THEN toB := RowsB END;
  4893. Block( fromA, toA, fromB, toB ); fromB := toB;
  4894. UNTIL toB = RowsB;
  4895. fromA := toA;
  4896. UNTIL toA = RowsA;
  4897. END MatMulHBlockX;
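(*
Example of the block size choice in MatMulHBlockR/X (values illustrative only): with
cBlockSize = 0, L2CacheSize = 512 * 1024 and a packed stride of 4096 bytes (ColsA = 1024 REALs),
the REAL variant uses BlockSize = 512 * 1024 DIV 4096 DIV 4 = 32 rows per tile; the LONGREAL
variant divides by 8 instead. Block then computes every C[i, j] of such a tile as an aligned
scalar product of row i of A and row j of the transposed B.
*)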
  4898. PROCEDURE CopyDataR( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  4899. VAR i: SIZE; t: HUGEINT;
  4900. BEGIN
  4901. Tic( t );
  4902. FOR i := 0 TO rows - 1 DO
  4903. Copy4( src, dest, incSrc, incDest, cols );
  4904. INC( src, strideSrc ); INC( dest, strideDest );
  4905. END;
  4906. Toc( t, copyT );
  4907. END CopyDataR;
  4908. PROCEDURE CopyDataX( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  4909. VAR i: SIZE; t: HUGEINT;
  4910. BEGIN
  4911. Tic( t );
  4912. FOR i := 0 TO rows - 1 DO
  4913. Copy8( src, dest, incSrc, incDest, cols );
  4914. INC( src, strideSrc ); INC( dest, strideDest );
  4915. END;
  4916. Toc( t, copyT );
  4917. END CopyDataX;
  4918. PROCEDURE MatMulARARTransposed( matrixA, matrixB, matrixC: ADDRESS;
  4919. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  4920. add: BOOLEAN ): BOOLEAN;
  4921. VAR stride: SIZE; adrB, adrC: ADDRESS;
  4922. proc: POINTER TO ARRAY OF MatMulHObjR;
  4923. from, to0, i: SIZE; cacheA, cacheB: Cache;
  4924. t: HUGEINT;
  4925. BEGIN
  4926. NEW(proc,nrProcesses);
  4927. ASSERT( ColsA = RowsB );
  4928. (* allocate 128 bit = 16 byte aligned matrix *)
  4929. stride := Align( ColsA * SIZEOF( REAL ), 16 );
  4930. IF (IncA # SIZEOF( REAL )) OR (StrideA # stride) OR
  4931. (matrixA MOD 16 # 0) THEN
  4932. cacheA := cachePool.Acquire( stride * RowsA );
  4933. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  4934. SIZEOF( REAL ), stride, RowsA, ColsA ); (* copy to array *)
  4935. matrixA := cacheA.adr;
  4936. ELSE cacheA := NIL;
  4937. END;
  4938. IF (StrideB # SIZEOF( REAL )) OR (IncB # stride) OR
  4939. (matrixB MOD 16 # 0) THEN
  4940. cacheB := cachePool.Acquire( stride * ColsB );
  4941. CopyDataR( matrixB, cacheB.adr, StrideB, IncB,
  4942. SIZEOF( REAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  4943. matrixB := cacheB.adr;
  4944. ELSE cacheB := NIL;
  4945. END;
  4946. Tic( t );
4947. (*! needs a decision rule for whether to split by rows or by columns *)
  4948. IF nrProcesses > 1 THEN
  4949. from := 0;
  4950. FOR i := 0 TO nrProcesses - 1 DO
  4951. (*
  4952. to := RowsA * (i + 1) DIV nrProcesses; adrA := matrixA + from * stride;
  4953. adrC := matrixC + from * StrideC;
  4954. *)
  4955. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  4956. adrB := matrixB + from * stride;
  4957. adrC := matrixC + from * IncC;
  4958. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  4959. RowsA, to0 - from, RowsB, add );
  4960. from := to0;
  4961. END;
  4962. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  4963. ELSE
  4964. MatMulHBlockR( matrixA, matrixB, matrixC, stride, IncC,
  4965. StrideC, RowsA, ColsB, RowsB, add );
  4966. END;
  4967. Toc( t, compT ); cachePool.Release( cacheA );
  4968. cachePool.Release( cacheB ); RETURN TRUE;
  4969. END MatMulARARTransposed;
  4970. PROCEDURE MatMulAXAXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  4971. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  4972. add: BOOLEAN ): BOOLEAN;
  4973. VAR stride: SIZE; adrB, adrC: ADDRESS;
  4974. proc: POINTER TO ARRAY OF MatMulHObjX;
  4975. from, to0, i: SIZE; cacheA, cacheB: Cache;
  4976. t: HUGEINT;
  4977. BEGIN
  4978. NEW(proc,nrProcesses);
  4979. ASSERT( ColsA = RowsB );
  4980. stride := Align( ColsA * SIZEOF( LONGREAL ), 16 );
  4981. IF (IncA # SIZEOF( LONGREAL )) OR (StrideA # stride) OR
  4982. (matrixA MOD 16 # 0) THEN
  4983. cacheA := cachePool.Acquire( stride * RowsA );
  4984. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  4985. SIZEOF( LONGREAL ), stride, RowsA, ColsA ); (* copy to array *)
  4986. matrixA := cacheA.adr;
  4987. ELSE cacheA := NIL;
  4988. END;
  4989. IF (StrideB # SIZEOF( LONGREAL )) OR (IncB # stride) OR
  4990. (matrixB MOD 16 # 0) THEN
  4991. cacheB := cachePool.Acquire( stride * ColsB );
  4992. CopyDataX( matrixB, cacheB.adr, StrideB, IncB,
  4993. SIZEOF( LONGREAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  4994. matrixB := cacheB.adr;
  4995. ELSE cacheB := NIL;
  4996. END;
  4997. Tic( t );
  4998. IF nrProcesses > 1 THEN
  4999. from := 0;
  5000. FOR i := 0 TO nrProcesses - 1 DO
  5001. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5002. adrB := matrixB + from * stride;
  5003. adrC := matrixC + from * IncC;
  5004. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5005. RowsA, to0 - from, RowsB, add );
  5006. from := to0;
  5007. END;
  5008. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5009. ELSE
  5010. MatMulHBlockX( matrixA, matrixB, matrixC, stride, IncC,
  5011. StrideC, RowsA, ColsB, RowsB, add );
  5012. END;
  5013. Toc( t, compT ); cachePool.Release( cacheA );
  5014. cachePool.Release( cacheB ); RETURN TRUE;
  5015. END MatMulAXAXTransposed;
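(*
Note on the transposed scheme: both MatMul*Transposed procedures copy B into a transposed,
16-byte aligned buffer, so that C[i, j] becomes a dot product of two contiguous, aligned rows
(AlignedSPRSSE / AlignedSPXSSE). With nrProcesses > 1 the columns of B are split by
to0 := ColsB * (i + 1) DIV nrProcesses; e.g. ColsB = 10 and nrProcesses = 4 gives the column
ranges 0..1, 2..4, 5..6 and 7..9, each handled by its own MatMulHObjR/X worker.
*)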
5016. (****** strided matrix multiplication with restrictions on the increments ******)
  5017. PROCEDURE MatMulARARSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5018. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5019. add: BOOLEAN ): BOOLEAN;
  5020. VAR sum: REAL; CbFrom, i, j, k: SIZE; valA, valB: REAL;
  5021. adrA, adrB, adrC: ADDRESS;
  5022. cacheA, cacheB, cacheC: Cache;
  5023. matrixCO, StrideCO, IncCO: SIZE; t: HUGEINT;
  5024. (*VAR fromA, toA: LONGINT; *)
  5025. BEGIN
  5026. IF (IncA # SIZEOF( REAL )) THEN
  5027. cacheA :=
  5028. cachePool.Acquire( RowsA * ColsA * SIZEOF( REAL ) );
  5029. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5030. SIZEOF( REAL ), SIZEOF( REAL ) * ColsA, RowsA,
  5031. ColsA );
  5032. matrixA := cacheA.adr; IncA := SIZEOF( REAL );
  5033. StrideA := SIZEOF( REAL ) * ColsA;
  5034. END;
  5035. IF (IncB # SIZEOF( REAL )) THEN
  5036. cacheB :=
  5037. cachePool.Acquire( RowsB * ColsB * SIZEOF( REAL ) );
  5038. CopyDataR( matrixB, cacheB.adr, IncB, StrideB,
  5039. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsB,
  5040. ColsB );
  5041. matrixB := cacheB.adr; IncB := SIZEOF( REAL );
  5042. StrideB := SIZEOF( REAL ) * ColsB;
  5043. END;
  5044. IF (IncC # SIZEOF( REAL )) THEN
  5045. cacheC :=
  5046. cachePool.Acquire( RowsA * ColsB * SIZEOF( REAL ) );
  5047. CopyDataR( matrixC, cacheC.adr, IncC, StrideC,
  5048. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsA,
  5049. ColsB );
  5050. matrixCO := matrixC; StrideCO := StrideC;
  5051. IncCO := IncC; matrixC := cacheC.adr;
  5052. IncC := SIZEOF( REAL ); StrideC := SIZEOF( REAL ) * ColsB;
  5053. END;
  5054. Tic( t );
  5055. CbFrom := 0;
  5056. IF ColsB >= 24 THEN
  5057. SSEMul24BlockR( CbFrom, StrideA, StrideB, StrideC,
  5058. ColsA, RowsA, ColsB, RowsB, matrixA,
  5059. matrixB, matrixC, add );
  5060. END;
  5061. IF ColsB - CbFrom >= 16 THEN
  5062. SSEMul16BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5063. CbFrom, matrixA, matrixB, matrixC, add );
  5064. INC( CbFrom, 16 );
  5065. END;
  5066. IF ColsB - CbFrom >= 8 THEN
  5067. SSEMul8BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5068. CbFrom, matrixA, matrixB, matrixC, add );
  5069. INC( CbFrom, 8 );
  5070. END;
  5071. IF ColsB - CbFrom >= 4 THEN
  5072. SSEMul4BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5073. CbFrom, matrixA, matrixB, matrixC, add );
  5074. INC( CbFrom, 4 );
  5075. END;
  5076. IF ColsB - CbFrom > 0 THEN
  5077. (* do it in Oberon *)
  5078. FOR i := 0 TO RowsA - 1 DO
  5079. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5080. FOR j := CbFrom TO ColsB - 1 DO
  5081. adrA := matrixA + i * StrideA;
  5082. adrB := matrixB + j * IncB;
  5083. IF add THEN SYSTEM.GET( adrC, sum )
  5084. ELSE sum := 0
  5085. END;
  5086. FOR k := 0 TO RowsB - 1 DO
  5087. SYSTEM.GET( adrA, valA );
  5088. SYSTEM.GET( adrB, valB );
  5089. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5090. INC( adrA, IncA ); INC( adrB, StrideB );
  5091. END;
  5092. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5093. (* C[i, j] := sum; *)
  5094. END;
  5095. END;
  5096. END;
  5097. Toc( t, compT );
  5098. IF cacheC # NIL THEN
  5099. CopyDataR( matrixC, matrixCO, IncC, StrideC, IncCO,
  5100. StrideCO, RowsA, ColsB );
  5101. END;
  5102. cachePool.Release( cacheA );
  5103. cachePool.Release( cacheB );
  5104. cachePool.Release( cacheC );
  5105. RETURN TRUE;
  5106. END MatMulARARSSEStride;
  5107. PROCEDURE MatMulAXAXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5108. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5109. add: BOOLEAN ): BOOLEAN;
  5110. VAR sum: LONGREAL; CbFrom, i, j, k: SIZE;
  5111. valA, valB: LONGREAL; adrA, adrB, adrC: ADDRESS;
  5112. cacheA, cacheB, cacheC: Cache;
  5113. matrixCO, StrideCO, IncCO:SIZE; t: HUGEINT;
  5114. BEGIN
  5115. IF (IncA # SIZEOF( LONGREAL )) THEN
  5116. cacheA :=
  5117. cachePool.Acquire( RowsA * ColsA * SIZEOF( LONGREAL ) );
  5118. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5119. SIZEOF( LONGREAL ),
  5120. SIZEOF( LONGREAL ) * ColsA, RowsA, ColsA );
  5121. matrixA := cacheA.adr;
  5122. StrideA := SIZEOF( LONGREAL ) * ColsA;
  5123. IncA := SIZEOF( LONGREAL );
  5124. END;
  5125. IF (IncB # SIZEOF( LONGREAL )) THEN
  5126. cacheB :=
  5127. cachePool.Acquire( RowsB * ColsB * SIZEOF( LONGREAL ) );
  5128. CopyDataX( matrixB, cacheB.adr, IncB, StrideB,
  5129. SIZEOF( LONGREAL ),
  5130. SIZEOF( LONGREAL ) * ColsB, RowsB, ColsB );
  5131. matrixB := cacheB.adr;
  5132. StrideB := SIZEOF( LONGREAL ) * ColsB;
  5133. IncB := SIZEOF( LONGREAL );
  5134. END;
  5135. IF (IncC # SIZEOF( LONGREAL )) THEN
  5136. cacheC :=
  5137. cachePool.Acquire( RowsA * ColsB * SIZEOF( LONGREAL ) );
  5138. CopyDataX( matrixC, cacheC.adr, IncC, StrideC,
  5139. SIZEOF( LONGREAL ),
  5140. SIZEOF( LONGREAL ) * ColsB, RowsA, ColsB );
  5141. matrixCO := matrixC; StrideCO := StrideC;
  5142. IncCO := IncC; StrideC := SIZEOF( LONGREAL ) * ColsB;
  5143. IncC := SIZEOF( LONGREAL ); matrixC := cacheC.adr;
  5144. END;
  5145. Tic( t );
  5146. CbFrom := 0;
  5147. IF ColsB >= 12 THEN
  5148. SSEMul12BlockX( CbFrom, StrideA, StrideB, StrideC,
  5149. ColsA, RowsA, ColsB, RowsB, matrixA,
  5150. matrixB, matrixC, add );
  5151. END;
  5152. IF ColsB - CbFrom >= 8 THEN
  5153. SSEMul8BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5154. CbFrom, matrixA, matrixB, matrixC, add );
  5155. INC( CbFrom, 8 );
  5156. END;
  5157. IF ColsB - CbFrom >= 4 THEN
  5158. SSEMul4BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5159. CbFrom, matrixA, matrixB, matrixC, add );
  5160. INC( CbFrom, 4 );
  5161. END;
  5162. IF ColsB - CbFrom >= 2 THEN
  5163. SSEMul2BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5164. CbFrom, matrixA, matrixB, matrixC, add );
  5165. INC( CbFrom, 2 );
  5166. END;
  5167. IF ColsB - CbFrom > 0 THEN
  5168. (* do it in Oberon *)
  5169. FOR i := 0 TO RowsA - 1 DO
  5170. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5171. FOR j := CbFrom TO ColsB - 1 DO
  5172. adrA := matrixA + i * StrideA;
  5173. adrB := matrixB + j * IncB;
  5174. IF add THEN SYSTEM.GET( adrC, sum )
  5175. ELSE sum := 0
  5176. END;
  5177. FOR k := 0 TO RowsB - 1 DO
  5178. SYSTEM.GET( adrA, valA );
  5179. SYSTEM.GET( adrB, valB );
  5180. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5181. INC( adrA, IncA ); INC( adrB, StrideB );
  5182. END;
  5183. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5184. (* C[i, j] := sum; *)
  5185. END;
  5186. END;
  5187. END;
  5188. Toc( t, compT );
  5189. IF cacheC # NIL THEN
  5190. CopyDataX( matrixC, matrixCO, IncC, StrideC, IncCO,
  5191. StrideCO, RowsA, ColsB );
  5192. END;
  5193. cachePool.Release( cacheA );
  5194. cachePool.Release( cacheB );
  5195. cachePool.Release( cacheC );
  5196. RETURN TRUE;
  5197. END MatMulAXAXSSEStride;
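(*
Example of the column decomposition in the stride kernels (assuming each SSEMul*Block helper
advances CbFrom past the columns it handled): for REAL and ColsB = 30, 24 columns go through
SSEMul24BlockR and 4 more through SSEMul4BlockR, and the remaining 2 columns are computed by the
scalar fallback loop inside the procedure; for LONGREAL and ColsB = 30 the panels are
24 (two times 12), 4 and 2, leaving nothing for the scalar loop.
*)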
5198. (****** naive Oberon matrix multiplication ******)
  5199. PROCEDURE MatMulARARNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5200. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5201. add: BOOLEAN );
  5202. (*
  5203. A is M x K matrix, M=rows (A); K=cols(A);
  5204. B is K x N matrix; K=rows(B); N = cols(B);
  5205. C is M x N matrix; M=rows(C); N=cols(C);
  5206. *)
  5207. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5208. val1, val2, sum: REAL; t: HUGEINT;
  5209. BEGIN
  5210. Tic( t );
  5211. FOR i := 1 TO M DO
  5212. adrC := matrixC; adrB := matrixB;
  5213. FOR j := 1 TO N DO
  5214. adrA := matrixA; innerB := adrB;
  5215. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5216. FOR k := 1 TO K DO
  5217. SYSTEM.GET( adrA, val1 );
  5218. SYSTEM.GET( innerB, val2 );
  5219. sum := sum + val1 * val2; INC( adrA, IncA );
  5220. INC( innerB, StrideB );
  5221. END;
  5222. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5223. INC( adrC, IncC );
  5224. END;
  5225. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5226. END;
  5227. Toc( t, compT );
  5228. END MatMulARARNaiive;
  5229. PROCEDURE MatMulAXAXNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5230. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5231. add: BOOLEAN );
  5232. (*
  5233. A is M x K matrix, M=rows (A); K=cols(A);
  5234. B is K x N matrix; K=rows(B); N = cols(B);
  5235. C is M x N matrix; M=rows(C); N=cols(C);
  5236. *)
  5237. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5238. val1, val2, sum: LONGREAL; t: HUGEINT;
  5239. BEGIN
  5240. Tic( t );
  5241. FOR i := 1 TO M DO
  5242. adrC := matrixC; adrB := matrixB;
  5243. FOR j := 1 TO N DO
  5244. adrA := matrixA; innerB := adrB;
  5245. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5246. FOR k := 1 TO K DO
  5247. SYSTEM.GET( adrA, val1 );
  5248. SYSTEM.GET( innerB, val2 );
  5249. sum := sum + val1 * val2; INC( adrA, IncA );
  5250. INC( innerB, StrideB );
  5251. END;
  5252. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5253. INC( adrC, IncC );
  5254. END;
  5255. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5256. END;
  5257. Toc( t, compT );
  5258. END MatMulAXAXNaiive;
  5259. (*
  5260. PROCEDURE Toggle( VAR A, B: LONGINT );
  5261. VAR temp: LONGINT;
  5262. BEGIN
  5263. temp := A; A := B; B := temp;
  5264. END Toggle;
  5265. PROCEDURE Transpose( VAR matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT );
  5266. (*
  5267. prepare computation of C=A*B via C = (B` * A`)`
  5268. *)
  5269. BEGIN
  5270. Toggle( matrixA, matrixB ); Toggle( IncA, StrideB ); Toggle( StrideA, IncB );
  5271. Toggle( IncC, StrideC ); Toggle( M, N );
  5272. END Transpose;
  5273. *)
  5274. (*
  5275. *)
  5276. PROCEDURE BestMethod( M, N, K: SIZE ): LONGINT;
  5277. BEGIN
  5278. IF M = 1 THEN
  5279. IF N < 32 THEN RETURN cMatMulScalarProduct
  5280. ELSIF N < 256 THEN
  5281. IF K < 256 THEN RETURN cMatMulScalarProduct
  5282. ELSE RETURN cMatMulStride
  5283. END;
  5284. ELSE RETURN cMatMulStride
  5285. END;
  5286. ELSIF N = 1 THEN
  5287. IF (M > 1024) & (K > 1024) THEN
  5288. RETURN cMatMulTransposed
  5289. ELSE RETURN cMatMulScalarProduct
  5290. END;
  5291. ELSIF K = 1 THEN
  5292. IF N < 32 THEN
  5293. IF M < 256 THEN RETURN cMatMulNaive
  5294. ELSE RETURN cMatMulStride
  5295. END;
  5296. ELSIF N < 256 THEN
  5297. IF M < 32 THEN RETURN cMatMulNaive
  5298. ELSE RETURN cMatMulStride
  5299. END;
  5300. ELSE RETURN cMatMulStride
  5301. END;
  5302. ELSIF M < 32 THEN
  5303. IF N < 32 THEN RETURN cMatMulScalarProduct
  5304. ELSIF N < 256 THEN
  5305. IF K < 32 THEN RETURN cMatMulScalarProduct
  5306. ELSE RETURN cMatMulStride
  5307. END;
  5308. ELSE RETURN cMatMulStride
  5309. END;
  5310. ELSIF M < 256 THEN
  5311. IF N < 32 THEN
  5312. IF K < 32 THEN RETURN cMatMulScalarProduct
  5313. ELSE RETURN cMatMulStride
  5314. END;
  5315. ELSE
  5316. IF K < 256 THEN RETURN cMatMulStride
  5317. ELSE RETURN cMatMulBlocked
  5318. END;
  5319. END;
  5320. ELSE
  5321. IF N < 32 THEN RETURN cMatMulStride ELSE
  5322. IF K < 256 THEN RETURN cMatMulStride
  5323. ELSE RETURN cMatMulBlocked
  5324. END;
  5325. END;
  5326. END;
  5327. RETURN cMatMulStride;
  5328. END BestMethod;
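(*
Examples derived from the rules above: BestMethod( 1000, 1000, 1000 ) = cMatMulBlocked,
BestMethod( 1, 1000, 1000 ) = cMatMulStride, and BestMethod( 100, 100, 100 ) = cMatMulStride
(medium sizes with K < 256 prefer the stride kernels).
*)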
  5329. (*
5330.      (N)          (K)         (N)
5331.     CCCCCC       AAAAA       BBBBB
5332.     CCCCCC       AAAAA       BBBBB
5333. (M) CCCCCC = (M) AAAAA * (K) BBBBB
5334.     CCCCCC       AAAAA       BBBBB
5335.     CCCCCC       AAAAA       BBBBB
  5336. *)
  5337. PROCEDURE MatMulR( matrixA, matrixB, matrixC: ADDRESS;
  5338. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5339. (*! heuristics for choosing among the different methods need improvement *)
5340. (*! transpose if superior *)
5341. (*! provide a special variant for small [up to 4x4] matrices *)
  5342. VAR M, N, K: SIZE;
  5343. BEGIN
  5344. ASSERT( ColsA = RowsB );
  5345. M := RowsA; N := ColsB; K := ColsA;
  5346. CASE BestMethod( M, N, K ) OF
  5347. | cMatMulScalarProduct:
  5348. RETURN FALSE;
  5349. | cMatMulNaive:
  5350. RETURN MatMulRNaive( matrixA, matrixB, matrixC, IncA,
  5351. StrideA, IncB, StrideB, IncC,
  5352. StrideC, RowsA, ColsA, RowsB,
  5353. ColsB );
  5354. | cMatMulTransposed:
  5355. RETURN MatMulARARTransposed( matrixA, matrixB,
  5356. matrixC, IncA,
  5357. StrideA, IncB,
  5358. StrideB, IncC,
  5359. StrideC, RowsA,
  5360. ColsA, RowsB,
  5361. ColsB, FALSE );
  5362. | cMatMulStride:
  5363. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5364. matrixC, IncA, StrideA,
  5365. IncB, StrideB, IncC,
  5366. StrideC, RowsA,
  5367. ColsA, RowsB, ColsB,
  5368. FALSE );
  5369. | cMatMulBlocked:
  5370. RETURN MatMulARARBlocked( matrixA, matrixB,
  5371. matrixC, IncA, StrideA,
  5372. IncB, StrideB, IncC,
  5373. StrideC, RowsA, ColsA,
  5374. RowsB, ColsB, FALSE );
  5375. ELSE
  5376. RETURN FALSE (* use scalar product for each row and column *)
  5377. END;
  5378. END MatMulR;
  5379. PROCEDURE MatMulX( matrixA, matrixB, matrixC: ADDRESS;
  5380. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5381. VAR M, N, K: SIZE;
  5382. BEGIN
  5383. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5384. K := ColsA;
  5385. (*
  5386. KernelLog.String("MatMulX, M,N,K = "); KernelLog.Int(M,10); KernelLog.Int(N,10); KernelLog.Int(K,10); KernelLog.Ln;
  5387. KernelLog.String("Method= "); KernelLog.Int( BestMethod(M,N,K),10); KernelLog.Ln;
  5388. *)
  5389. CASE BestMethod( M, N, K ) OF
  5390. | cMatMulScalarProduct:
  5391. RETURN FALSE;
  5392. | cMatMulNaive:
  5393. RETURN MatMulXNaive( matrixA, matrixB, matrixC, IncA,
  5394. StrideA, IncB, StrideB, IncC,
  5395. StrideC, RowsA, ColsA, RowsB,
  5396. ColsB );
  5397. | cMatMulTransposed:
  5398. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5399. matrixC, IncA,
  5400. StrideA, IncB, StrideB,
  5401. IncC, StrideC, RowsA,
  5402. ColsA, RowsB, ColsB,
  5403. FALSE );
  5404. | cMatMulStride:
  5405. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5406. matrixC, IncA, StrideA,
  5407. IncB, StrideB, IncC,
  5408. StrideC, RowsA, ColsA,
  5409. RowsB, ColsB,
  5410. FALSE );
  5411. | cMatMulBlocked:
  5412. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5413. matrixC, IncA, StrideA,
  5414. IncB, StrideB, IncC,
  5415. StrideC, RowsA, ColsA,
  5416. RowsB, ColsB, FALSE );
  5417. ELSE
  5418. RETURN FALSE (* use scalar product for each row and column *)
  5419. END;
  5420. END MatMulX;
  5421. PROCEDURE MatMulIncR( matrixA, matrixB, matrixC: ADDRESS;
  5422. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5423. (*! heuristics for choosing among the different methods need improvement *)
5424. (*! transpose if superior *)
5425. (*! provide a special variant for small [up to 4x4] matrices *)
  5426. VAR M, N, K: SIZE;
  5427. BEGIN
  5428. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5429. K := ColsA;
  5430. CASE BestMethod( M, N, K ) OF
  5431. | cMatMulScalarProduct:
  5432. RETURN FALSE;
  5433. | cMatMulNaive:
  5434. RETURN MatMulIncRNaive( matrixA, matrixB, matrixC,
  5435. IncA, StrideA, IncB, StrideB,
  5436. IncC, StrideC, RowsA, ColsA,
  5437. RowsB, ColsB );
  5438. | cMatMulTransposed:
  5439. RETURN MatMulARARTransposed( matrixA, matrixB,
  5440. matrixC, IncA,
  5441. StrideA, IncB,
  5442. StrideB, IncC,
  5443. StrideC, RowsA,
  5444. ColsA, RowsB,
  5445. ColsB, TRUE );
  5446. | cMatMulStride:
  5447. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5448. matrixC, IncA, StrideA,
  5449. IncB, StrideB, IncC,
  5450. StrideC, RowsA,
  5451. ColsA, RowsB, ColsB,
  5452. TRUE );
  5453. | cMatMulBlocked:
  5454. RETURN MatMulARARBlocked( matrixA, matrixB,
  5455. matrixC, IncA, StrideA,
  5456. IncB, StrideB, IncC,
  5457. StrideC, RowsA, ColsA,
  5458. RowsB, ColsB, TRUE );
  5459. ELSE
  5460. RETURN FALSE (* use scalar product for each row and column *)
  5461. END;
  5462. END MatMulIncR;
  5463. PROCEDURE MatMulIncX( matrixA, matrixB, matrixC: ADDRESS;
  5464. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5465. VAR M, N, K: SIZE;
  5466. BEGIN
  5467. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5468. K := ColsA;
  5469. CASE BestMethod( M, N, K ) OF
  5470. | cMatMulScalarProduct:
  5471. RETURN FALSE;
  5472. | cMatMulNaive:
  5473. RETURN MatMulIncXNaive( matrixA, matrixB, matrixC,
  5474. IncA, StrideA, IncB, StrideB,
  5475. IncC, StrideC, RowsA, ColsA,
  5476. RowsB, ColsB );
  5477. | cMatMulTransposed:
  5478. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5479. matrixC, IncA,
  5480. StrideA, IncB, StrideB,
  5481. IncC, StrideC, RowsA,
  5482. ColsA, RowsB, ColsB,
  5483. TRUE );
  5484. | cMatMulStride:
  5485. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5486. matrixC, IncA, StrideA,
  5487. IncB, StrideB, IncC,
  5488. StrideC, RowsA, ColsA,
  5489. RowsB, ColsB, TRUE );
  5490. | cMatMulBlocked:
  5491. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5492. matrixC, IncA, StrideA,
  5493. IncB, StrideB, IncC,
  5494. StrideC, RowsA, ColsA,
  5495. RowsB, ColsB, TRUE );
  5496. ELSE
  5497. RETURN FALSE (* use scalar product for each row and column *)
  5498. END;
  5499. END MatMulIncX;
  5500. PROCEDURE MatMulARARBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5501. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5502. add: BOOLEAN ): BOOLEAN;
  5503. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5504. BEGIN
  5505. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5506. K := ColsA; MagicBlockR( M, N, K, L2M, L2N, L2K );
  5507. (*
  5508. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5509. IncC, StrideC, RowsA, ColsB, ColsA );
  5510. *)
  5511. MultiplyR( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5512. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5513. StrideC, add );
  5514. RETURN TRUE;
  5515. END MatMulARARBlocked;
  5516. PROCEDURE MatMulAXAXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5517. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5518. add: BOOLEAN ): BOOLEAN;
  5519. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5520. BEGIN
  5521. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5522. K := ColsA; MagicBlockX( M, N, K, L2M, L2N, L2K );
  5523. (*
  5524. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5525. IncC, StrideC, RowsA, ColsB, ColsA );
  5526. *)
  5527. MultiplyX( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5528. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5529. StrideC, add );
  5530. RETURN TRUE;
  5531. END MatMulAXAXBlocked;
  5532. PROCEDURE MatMulRNaive( matrixA, matrixB, matrixC: ADDRESS;
  5533. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5534. BEGIN
  5535. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5536. IncB, StrideB, IncC, StrideC, RowsA,
  5537. ColsB, ColsA, FALSE );
  5538. RETURN TRUE;
  5539. END MatMulRNaive;
  5540. PROCEDURE MatMulXNaive( matrixA, matrixB, matrixC: ADDRESS;
  5541. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5542. BEGIN
  5543. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5544. IncB, StrideB, IncC, StrideC, RowsA,
  5545. ColsB, ColsA, FALSE );
  5546. RETURN TRUE;
  5547. END MatMulXNaive;
  5548. PROCEDURE MatMulIncRNaive( matrixA, matrixB, matrixC: ADDRESS;
  5549. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5550. BEGIN
  5551. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5552. IncB, StrideB, IncC, StrideC, RowsA,
  5553. ColsB, ColsA, TRUE );
  5554. RETURN TRUE;
  5555. END MatMulIncRNaive;
  5556. PROCEDURE MatMulIncXNaive( matrixA, matrixB, matrixC: ADDRESS;
  5557. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5558. BEGIN
  5559. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5560. IncB, StrideB, IncC, StrideC, RowsA,
  5561. ColsB, ColsA, TRUE );
  5562. RETURN TRUE;
  5563. END MatMulIncXNaive;
  5564. PROCEDURE MatMulXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5565. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5566. BEGIN
  5567. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  5568. IncA, StrideA, IncB,
  5569. StrideB, IncC, StrideC,
  5570. RowsA, ColsA, RowsB,
  5571. ColsB, FALSE );
  5572. END MatMulXTransposed;
  5573. PROCEDURE MatMulIncXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5574. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5575. BEGIN
  5576. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  5577. IncA, StrideA, IncB,
  5578. StrideB, IncC, StrideC,
  5579. RowsA, ColsA, RowsB,
  5580. ColsB, TRUE )
  5581. END MatMulIncXTransposed;
  5582. PROCEDURE MatMulRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5583. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5584. BEGIN
  5585. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  5586. IncA, StrideA, IncB,
  5587. StrideB, IncC, StrideC,
  5588. RowsA, ColsA, RowsB,
  5589. ColsB, FALSE );
  5590. END MatMulRTransposed;
  5591. PROCEDURE MatMulIncRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5592. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5593. BEGIN
  5594. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  5595. IncA, StrideA, IncB,
  5596. StrideB, IncC, StrideC,
  5597. RowsA, ColsA, RowsB,
  5598. ColsB, TRUE )
  5599. END MatMulIncRTransposed;
  5600. PROCEDURE MatMulXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5601. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5602. BEGIN
  5603. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  5604. IncA, StrideA, IncB, StrideB,
  5605. IncC, StrideC, RowsA,
  5606. ColsA, RowsB, ColsB,
  5607. FALSE );
  5608. END MatMulXSSEStride;
  5609. PROCEDURE MatMulIncXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5610. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5611. BEGIN
  5612. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  5613. IncA, StrideA, IncB, StrideB,
  5614. IncC, StrideC, RowsA,
  5615. ColsA, RowsB, ColsB,
  5616. TRUE );
  5617. END MatMulIncXSSEStride;
  5618. PROCEDURE MatMulRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5619. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5620. BEGIN
  5621. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  5622. IncA, StrideA, IncB, StrideB,
  5623. IncC, StrideC, RowsA,
  5624. ColsA, RowsB, ColsB,
  5625. FALSE );
  5626. END MatMulRSSEStride;
  5627. PROCEDURE MatMulIncRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5628. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5629. BEGIN
  5630. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  5631. IncA, StrideA, IncB, StrideB,
  5632. IncC, StrideC, RowsA,
  5633. ColsA, RowsB, ColsB,
  5634. TRUE )
  5635. END MatMulIncRSSEStride;
  5636. PROCEDURE MatMulRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5637. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5638. BEGIN
  5639. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  5640. IncA, StrideA, IncB, StrideB,
  5641. IncC, StrideC, RowsA, ColsA,
  5642. RowsB, ColsB, FALSE )
  5643. END MatMulRBlocked;
  5644. PROCEDURE MatMulIncRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5645. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5646. BEGIN
  5647. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  5648. IncA, StrideA, IncB, StrideB,
  5649. IncC, StrideC, RowsA, ColsA,
  5650. RowsB, ColsB, TRUE )
  5651. END MatMulIncRBlocked;
  5652. PROCEDURE MatMulXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5653. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5654. BEGIN
  5655. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  5656. IncA, StrideA, IncB, StrideB,
  5657. IncC, StrideC, RowsA, ColsA,
  5658. RowsB, ColsB, FALSE )
  5659. END MatMulXBlocked;
  5660. PROCEDURE MatMulIncXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5661. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5662. BEGIN
  5663. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  5664. IncA, StrideA, IncB, StrideB,
  5665. IncC, StrideC, RowsA, ColsA,
  5666. RowsB, ColsB, TRUE )
  5667. END MatMulIncXBlocked;
  5668. PROCEDURE SetMatMulMethod*( i: LONGINT );
  5669. BEGIN
  5670. KernelLog.String("ArrayBaseOptimized, method = ");
  5671. IF i = cMatMulDynamic THEN
  5672. KernelLog.String("dynamic.");
  5673. ArrayBase.matMulIncR := MatMulIncR;
  5674. ArrayBase.matMulIncX := MatMulIncX;
  5675. ArrayBase.matMulR := MatMulR;
  5676. ArrayBase.matMulX := MatMulX;
  5677. ELSIF i = cMatMulScalarProduct THEN
  5678. KernelLog.String("scalarproduct.");
  5679. ArrayBase.matMulIncR := NIL;
  5680. ArrayBase.matMulIncX := NIL;
  5681. ArrayBase.matMulR := NIL; ArrayBase.matMulX := NIL;
  5682. ELSIF i = cMatMulNaive THEN
  5683. KernelLog.String("naiive.");
  5684. ArrayBase.matMulR := MatMulRNaive;
  5685. ArrayBase.matMulX := MatMulXNaive;
  5686. ArrayBase.matMulIncR := MatMulIncRNaive;
  5687. ArrayBase.matMulIncX := MatMulIncXNaive;
  5688. ELSIF i = cMatMulTransposed THEN
  5689. KernelLog.String("transposed.");
  5690. ArrayBase.matMulR := MatMulRTransposed;
  5691. ArrayBase.matMulX := MatMulXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  5692. ArrayBase.matMulIncR := MatMulIncRTransposed;
  5693. ArrayBase.matMulIncX := MatMulIncXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  5694. ELSIF i = cMatMulStride THEN
  5695. KernelLog.String("stride.");
  5696. ArrayBase.matMulR := MatMulRSSEStride;
  5697. ArrayBase.matMulX := MatMulXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  5698. ArrayBase.matMulIncR := MatMulIncRSSEStride;
  5699. ArrayBase.matMulIncX := MatMulIncXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  5700. ELSIF i = cMatMulBlocked THEN
  5701. KernelLog.String("blocked.");
  5702. ArrayBase.matMulR := MatMulRBlocked;
  5703. ArrayBase.matMulX := MatMulXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  5704. ArrayBase.matMulIncR := MatMulIncRBlocked;
  5705. ArrayBase.matMulIncX := MatMulIncXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  5706. END;
  5707. KernelLog.Ln;
  5708. END SetMatMulMethod;
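(*
Usage sketch (the qualifying module name is taken from the file name and is an assumption):
another module can switch the installed multiplication kernels at run time, e.g.

	FoxArrayBaseOptimized.SetMatMulMethod( i );

where i is one of the cMatMul* constants; cMatMulDynamic installs the BestMethod-based
dispatchers MatMulR/MatMulX, while the other values force one particular kernel family.
*)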
  5709. (* optimizations for small arrays (Alexey Morozov) *)
  5710. (* assumes that all arrays do not overlap *)
  5711. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
  5712. PROCEDURE MatMulR2x2(dadr, ladr, radr: ADDRESS);
  5713. CODE{SYSTEM.AMD64, SYSTEM.SSE2}
  5714. MOV RBX, [RBP+radr] ; RBX := ADDR(right)
  5715. MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
  5716. MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
  5717. MOVUPS XMM0, [RAX] ; [a00,a01,a10,a11]
  5718. MOVUPS XMM1, [RBX] ; [b00,b01,b10,b11]
  5719. MOVAPS XMM2, XMM1
  5720. SHUFPS XMM2, XMM1, 204 ; XMM2 := [b00,b11,b00,b11]
  5721. MULPS XMM2, XMM0
  5722. SHUFPS XMM0, XMM0, 177 ; XMM0 := [a01,a00,a11,a10]
  5723. SHUFPS XMM1, XMM1, 102 ; XMM1 := [b10,b01,b10,b01]
  5724. MULPS XMM1, XMM0
  5725. ADDPS XMM1, XMM2
  5726. MOVUPS [RCX], XMM1
  5727. END MatMulR2x2;
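(*
Verification of the shuffle constants for the row-major 2x2 case:
XMM2 = [b00,b11,b00,b11] * [a00,a01,a10,a11] = [a00*b00, a01*b11, a10*b00, a11*b11]
XMM1 = [b10,b01,b10,b01] * [a01,a00,a11,a10] = [a01*b10, a00*b01, a11*b10, a10*b01]
and their sum is [c00, c01, c10, c11] with c[i,j] = a[i,0]*b[0,j] + a[i,1]*b[1,j].
*)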
  5728. (* based on weighted sum of rows (Alexey Morozov) *)
  5729. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
  5730. PROCEDURE MatMulR3x3(dadr, ladr, radr: ADDRESS);
  5731. CODE{SYSTEM.AMD64, SYSTEM.SSE2}
  5732. MOV RBX, [RBP+radr] ; RBX := ADDR(right)
  5733. MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
  5734. MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
  5735. MOVUPS XMM0, [RBX] ; XMM0 := [b00,b01,b02,-]
  5736. MOVUPS XMM1, [RBX+12] ; XMM1 := [b10,b11,b12,-]
5737. ; note: this unaligned 16-byte load reads 4 bytes past the 3x3 matrix (36 bytes); the commented-out MOVLPS/MOVSS sequence below would avoid the over-read
  5738. MOVUPS XMM2, [RBX+24] ; XMM2 := [b20,b21,b22,-]
  5739. ;MOVLPS XMM2, [RBX+24]
  5740. ;MOVSS XMM3, [RBX+32]
  5741. ;MOVLHPS XMM2, XMM3
  5742. MOVSS XMM3, [RAX]
  5743. SHUFPS XMM3, XMM3, 0; XMM3 := [a00,a00,a00,-]
  5744. MOVAPS XMM4, XMM0
  5745. MULPS XMM4, XMM3
  5746. MOVSS XMM3, [RAX+4]
  5747. SHUFPS XMM3, XMM3, 0; XMM3 := [a01,a01,a01,-]
  5748. MULPS XMM3, XMM1
  5749. ADDPS XMM4, XMM3
  5750. MOVSS XMM3, [RAX+8]
  5751. SHUFPS XMM3, XMM3, 0; XMM3 := [a02,a02,a02,-]
  5752. MULPS XMM3, XMM2
  5753. ADDPS XMM4, XMM3
  5754. MOVUPS [RCX], XMM4
  5755. ;***************************************************;
  5756. MOVSS XMM3, [RAX+12]
  5757. SHUFPS XMM3, XMM3, 0; XMM3 := [a10,a10,a10,-]
  5758. MOVAPS XMM4, XMM0
  5759. MULPS XMM4, XMM3
  5760. MOVSS XMM3, [RAX+16]
  5761. SHUFPS XMM3, XMM3, 0; XMM3 := [a11,a11,a11,-]
  5762. MULPS XMM3, XMM1
  5763. ADDPS XMM4, XMM3
  5764. MOVSS XMM3, [RAX+20]
  5765. SHUFPS XMM3, XMM3, 0; XMM3 := [a12,a12,a12,-]
  5766. MULPS XMM3, XMM2
  5767. ADDPS XMM4, XMM3
  5768. MOVUPS [RCX+12], XMM4
  5769. ;***************************************************;
  5770. MOVSS XMM3, [RAX+24]
  5771. SHUFPS XMM3, XMM3, 0; XMM3 := [a20,a20,a20,-]
  5772. MOVAPS XMM4, XMM0
  5773. MULPS XMM4, XMM3
  5774. MOVSS XMM3, [RAX+28]
  5775. SHUFPS XMM3, XMM3, 0; XMM3 := [a21,a21,a21,-]
  5776. MULPS XMM3, XMM1
  5777. ADDPS XMM4, XMM3
  5778. MOVSS XMM3, [RAX+32]
  5779. SHUFPS XMM3, XMM3, 0; XMM3 := [a22,a22,a22,-]
  5780. MULPS XMM3, XMM2
  5781. ADDPS XMM4, XMM3
  5782. ;MOVUPS [RCX+24], XMM4
  5783. MOVLPS [RCX+24], XMM4
  5784. MOVHLPS XMM4, XMM4
  5785. MOVSS [RCX+32], XMM4
  5786. END MatMulR3x3;
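(*
MatMulR3x3 computes each result row as a weighted sum of the rows of B,
row i of C = a[i,0]*B[0,..] + a[i,1]*B[1,..] + a[i,2]*B[2,..],
using one broadcast (MOVSS + SHUFPS with immediate 0) per coefficient, i.e. three MULPS and
two ADDPS on 4-float vectors per result row.
*)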
5787. (* based on a 2x2 block decomposition of the 4x4 matrices (8 sub-multiplications, not Strassen's 7) (Alexey Morozov) *)
  5788. (* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
  5789. PROCEDURE MatMulR4x4(dadr, ladr, radr: ADDRESS);
  5790. CODE{SYSTEM.AMD64, SYSTEM.SSE2}
  5791. MOV RBX, [RBP+radr] ; RBX := ADDR(right)
  5792. MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
  5793. MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
  5794. ; load A00
  5795. MOVLPS XMM0, [RAX] ; XMM0 := [a00,a01,-,-]
  5796. MOVHPS XMM0, [RAX+16] ; XMM0 := [a00,a01,a10,a11]
  5797. ; load A01
  5798. MOVLPS XMM1, [RAX+8] ; XMM1 := [a02,a03,-,-]
  5799. MOVHPS XMM1, [RAX+24] ; XMM1 := [a02,a03,a12,a13]
  5800. ; load B00
  5801. MOVLPS XMM2, [RBX] ; XMM2 := [b00,b01,-,-]
  5802. MOVHPS XMM2, [RBX+16] ; XMM2 := [b00,b01,b10,b11]
  5803. ; load B01
5804. MOVLPS XMM3, [RBX+8] ; XMM3 := [b02,b03,-,-]
5805. MOVHPS XMM3, [RBX+24] ; XMM3 := [b02,b03,b12,b13]
  5806. ; load B10
  5807. MOVLPS XMM4, [RBX+32] ; XMM4 := [b20,b21,-,-]
  5808. MOVHPS XMM4, [RBX+48] ; XMM4 := [b20,b21,b30,b31]
  5809. ; load B11
  5810. MOVLPS XMM5, [RBX+40] ; XMM5 := [b22,b23,-,-]
  5811. MOVHPS XMM5, [RBX+56] ; XMM5 := [b22,b23,b32,b33]
  5812. ;****************************************************;
; multiply A00(D)*B00(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM2
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM2
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A01(D)*B10(E)
MOVAPS XMM0, XMM4
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM4
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [RCX], XMM7
MOVHPS [RCX+16], XMM7
;****************************************************;
; load A00
MOVLPS XMM0, [RAX] ; XMM0 := [a00,a01,-,-]
MOVHPS XMM0, [RAX+16] ; XMM0 := [a00,a01,a10,a11]
; load A01
MOVLPS XMM1, [RAX+8] ; XMM1 := [a02,a03,-,-]
MOVHPS XMM1, [RAX+24] ; XMM1 := [a02,a03,a12,a13]
; multiply A00(D)*B01(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM3
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM3
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A01(D)*B11(E)
MOVAPS XMM0, XMM5
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM5
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [RCX+8], XMM7
MOVHPS [RCX+24], XMM7
;****************************************************;
; load A10
MOVLPS XMM0, [RAX+32] ; XMM0 := [a20,a21,-,-]
MOVHPS XMM0, [RAX+48] ; XMM0 := [a20,a21,a30,a31]
; load A11
MOVLPS XMM1, [RAX+40] ; XMM1 := [a22,a23,-,-]
MOVHPS XMM1, [RAX+56] ; XMM1 := [a22,a23,a32,a33]
; multiply A10(D)*B00(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM2
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM2
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A11(D)*B10(E)
MOVAPS XMM0, XMM4
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM4
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [RCX+32], XMM7
MOVHPS [RCX+48], XMM7
;****************************************************;
; load A10
MOVLPS XMM0, [RAX+32] ; XMM0 := [a20,a21,-,-]
MOVHPS XMM0, [RAX+48] ; XMM0 := [a20,a21,a30,a31]
; load A11
MOVLPS XMM1, [RAX+40] ; XMM1 := [a22,a23,-,-]
MOVHPS XMM1, [RAX+56] ; XMM1 := [a22,a23,a32,a33]
; multiply A10(D)*B01(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM3
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM3
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A11(D)*B11(E)
MOVAPS XMM0, XMM5
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM5
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [RCX+40], XMM7
MOVHPS [RCX+56], XMM7
END MatMulR4x4;
(* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
(* FIXME: speed it up when horizontal add is available!!! *)
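(* computes dest := A*b for a 2x2 matrix A and a 2-vector b: after the packed multiply, XMM0 = [a00*b00, a01*b10, a10*b00, a11*b10]; the two SHUFPS/ADDPS steps emulate the missing horizontal add and yield [a00*b00+a01*b10, a10*b00+a11*b10] *)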
PROCEDURE MatVecMulR2x2(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.AMD64, SYSTEM.SSE2}
MOV RBX, [RBP+radr] ; RBX := ADDR(right)
MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
; load the whole matrix
MOVUPS XMM0, [RAX] ; XMM0 := [a00,a01,a10,a11]
MOVLPS XMM1, [RBX] ; XMM1 := [b00,b10,-,-]
MOVLHPS XMM1, XMM1 ; XMM1 := [b00,b10,b00,b10]
MULPS XMM0, XMM1 ; XMM0 := [a00*b00,a01*b10,a10*b00,a11*b10]
MOVAPS XMM1, XMM0
SHUFPS XMM0, XMM0, 8; XMM0 := [a00*b00,a10*b00,-,-]
SHUFPS XMM1, XMM1, 13; XMM1 := [a01*b10,a11*b10,-,-]
ADDPS XMM0, XMM1
MOVLPS [RCX], XMM0
END MatVecMulR2x2;
(* PH *)
(* to do: use MOVAPS when Felix fixes issues with alignment *)
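(* computes dest[i] := a[i,0]*b[0] + a[i,1]*b[1] + a[i,2]*b[2] + a[i,3]*b[3] for i = 0..3: each row is multiplied element-wise with the vector and the three HADDPS steps reduce the four partial sums of each row into one lane of the result *)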
PROCEDURE MatVecMulR4x4(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.AMD64, SYSTEM.SSE3}
MOV RBX, [RBP+radr] ; RBX := ADDR(right)
MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
MOVUPS XMM0, [RBX] ; XMM0 := [b0,b1,b2,b3]
MOVUPS XMM1, [RAX] ; XMM1 := [a00,a01,a02,a03]
MOVUPS XMM2, [RAX+16] ; XMM2 := [a10,a11,a12,a13]
MOVUPS XMM3, [RAX+32] ; XMM3 := [a20,a21,a22,a23]
MOVUPS XMM4, [RAX+48] ; XMM4 := [a30,a31,a32,a33]
MULPS XMM1, XMM0
MULPS XMM2, XMM0
HADDPS XMM1, XMM2 ; adjacent pairs are horizontally added
MULPS XMM3, XMM0
MULPS XMM4, XMM0
HADDPS XMM3, XMM4 ; adjacent pairs are horizontally added
HADDPS XMM1, XMM3 ; adjacent pairs are horizontally added
MOVUPS [RCX], XMM1
END MatVecMulR4x4;
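(* command: ArrayBaseOptimized.InstallMatMul dynamic|scalarproduct|naive|transposed|stride|blocked ~ selects the matrix multiplication method; an unknown name falls back to dynamic *)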
PROCEDURE InstallMatMul*(context: Commands.Context);
VAR type: LONGINT; string: ARRAY 32 OF CHAR;
BEGIN
context.arg.String(string);
IF string = "dynamic" THEN
type := cMatMulDynamic;
ELSIF string = "scalarproduct" THEN
type := cMatMulScalarProduct
ELSIF string = "naive" THEN
type := cMatMulNaive
ELSIF string = "transposed" THEN
type := cMatMulTransposed
ELSIF string = "stride" THEN
type := cMatMulStride
ELSIF string = "blocked" THEN
type := cMatMulBlocked
ELSE
KernelLog.String("unknown method: "); KernelLog.String(string); KernelLog.Ln;
type := cMatMulDynamic;
END;
SetMatMulMethod( type );
END InstallMatMul;
PROCEDURE InstallAsm*;
BEGIN
KernelLog.String( "ASM " );
ArrayBase.loopSPAXAX := SPAXAXLoopA;
ArrayBase.loopSPARAR := SPARARLoopA;
ArrayBase.loopAddAXAX := AddAXAXLoopA;
ArrayBase.loopAddARAR := AddARARLoopA;
ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
ArrayBase.loopMatMulARAR := MatMulARARLoopA;
ArrayBase.loopMulAXSX := MulAXSXLoopA;
ArrayBase.loopIncMulAXSX := IncMulAXSXLoopA;
ArrayBase.loopMulARSR := MulARSRLoopA;
ArrayBase.loopIncMulARSR := IncMulARSRLoopA;
ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopA;
ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopA;
ArrayBase.transpose4 := Transpose4;
ArrayBase.transpose8 := Transpose8;
END InstallAsm;
PROCEDURE InstallSSE*;
BEGIN
IF Machine.SSESupport THEN
KernelLog.String( "SSE " );
ArrayBase.loopSPARAR := SPARARLoopSSE;
ArrayBase.loopAddARAR := AddARARLoopSSE;
ArrayBase.loopMulARSR := MulARSRLoopSSE;
ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
ArrayBase.matMulR := MatMulR;
ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopSSE;
ArrayBase.matMulIncR := MatMulIncR;
(* optimizations for small matrices (Alexey Morozov) *)
ArrayBase.matMulR2x2 := MatMulR2x2;
ArrayBase.matMulR3x3 := MatMulR3x3;
ArrayBase.matMulR4x4 := MatMulR4x4;
ArrayBase.matVecMulR2x2 := MatVecMulR2x2;
END;
END InstallSSE;
PROCEDURE InstallSSE2*; (* extra for testing, will be merged with Install in later versions *)
BEGIN
IF Machine.SSE2Support THEN
KernelLog.String( "SSE2 " );
ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;
ArrayBase.matMulX := MatMulX;
ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopSSE;
ArrayBase.matMulIncX := MatMulIncX;
END;
END InstallSSE2;
(*! to do: currently this only works for Win, not for native, because SSE3Support is not yet implemented in I386.Machine.Mod *)
PROCEDURE InstallSSE3*; (* extra for testing, will be merged with Install in later versions *)
BEGIN
IF Machine.SSE3Support THEN
KernelLog.String( "SSE3 " );
(* optimizations for small matrices *)
ArrayBase.matVecMulR4x4 := MatVecMulR4x4;
END;
END InstallSSE3;
PROCEDURE Install*;
BEGIN
KernelLog.String( "ArrayBaseOptimized: installing runtime library optimizations:" );
InstallAsm; InstallSSE; InstallSSE2; InstallSSE3;
KernelLog.String( " done." ); KernelLog.Ln;
END Install;
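(* command: ArrayBaseOptimized.SetParameters blockSize nrProcesses ~ ; blockSize = 0 selects the automatic block size, nrProcesses = 0 selects the number of available processors, and values above maxProcesses are clamped *)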
PROCEDURE SetParameters*( context: Commands.Context );
BEGIN
context.arg.SkipWhitespace; context.arg.Int(cBlockSize,TRUE);
context.arg.SkipWhitespace; context.arg.Int(nrProcesses,TRUE);
IF nrProcesses > maxProcesses THEN
nrProcesses := maxProcesses
ELSIF nrProcesses = 0 THEN nrProcesses := Machine.NumberOfProcessors();
END;
KernelLog.String( "BlockSize=" ); KernelLog.Int( cBlockSize, 0 );
KernelLog.String( ", NrProcesses = " ); KernelLog.Int( nrProcesses, 0 ); KernelLog.Ln;
END SetParameters;
BEGIN
cBlockSize := 0; (* automatic *)
nrProcesses := Machine.NumberOfProcessors(); (* automatic *)
allocT := 0; copyT := 0; compT := 0;
NEW( cachePool );
END FoxArrayBaseOptimized.
SystemTools.Free ArrayBaseOptimized ~
ArrayBaseOptimized.Install ~
ArrayBaseOptimized.InstallSSE2 ~
ArrayBaseOptimized.InstallSSE ~
ArrayBaseOptimized.InstallAsm ~
ArrayBaseOptimized.InstallMatMul dynamic ~
ArrayBaseOptimized.InstallMatMul scalarproduct ~
ArrayBaseOptimized.InstallMatMul transposed ~
ArrayBaseOptimized.InstallMatMul naive ~
ArrayBaseOptimized.InstallMatMul stride ~
ArrayBaseOptimized.InstallMatMul blocked ~
ArrayBaseOptimized.SetParameters 0 1 ~ (* BlockSize, NrProcesses *)